diff --git a/mypaper-final.tex b/mypaper-final.tex index 059a6b4e88efaa4e390bd780e6bd28ca1228230f..b79344ef04fb09c4939ae467c958e25ba1f3af46 100644 --- a/mypaper-final.tex +++ b/mypaper-final.tex @@ -205,11 +205,11 @@ compromising precision, describing and classifying relevant documents that are not amenable to filtering , and estimating the upper-bound of recall on entity-based filtering. -<<<<<<< HEAD + The rest of the paper is organized as follows. Section \ref{sec:desc} describes the dataset and section \ref{sec:fil} defines the task. In section \ref{sec:lit}, we discuss related litrature folowed by a discussion of our method in \ref{sec:mthd}. Following that, we present the experimental resulsy in \ref{sec:expr}, and discuss and analyze them in \ref{sec:analysis}. Towards the end, we discuss the impact of filtering choices on classification in section \ref{sec:impact}, examine and categorize unfilterable docuemnts in section \ref{sec:unfil}. Finally, we present our conclusions in \ref{sec:conc}. -======= -The rest of the paper is organized as follows. Section \ref{sec:desc} describes the dataset and section \ref{sec:fil} defines the task. In section \ref{sec:lit}, we discuss related litrature folowed by a discussion of our method in \ref{sec:mthd}. Following that, we present the experimental resulsy in \ref{sec:expr}, and discuss and analyze them in \ref{sec:analysis}. Towards the end, we discuss the impact of filtering choices on classification in section \ref{sec:impact}, examine and categorize unfilterable documents in section \ref{sec:unfil}. Finally, we present our conclusions in \ref{}{sec:conc}. ->>>>>>> 51b8586f2e1def3777b3e65737b7ab32c2ff0981 + +The rest of the paper is organized as follows. Section \ref{sec:desc} describes the dataset and section \ref{sec:fil} defines the task. In section \ref{sec:lit}, we discuss related literature followed by a discussion of our method in \ref{sec:mthd}. 
Following that, we present the experimental results in \ref{sec:expr}, and discuss and analyze them in \ref{sec:analysis}. Towards the end, we discuss the impact of filtering choices on classification in section \ref{sec:impact}, examine and categorize unfilterable documents in section \ref{sec:unfil}. Finally, we present our conclusions in \ref{sec:conc}. + \section{Data Description}\label{sec:desc} @@ -367,7 +367,11 @@ and scripts \cite{frank2013stream} to report max-F, the maximum F-score obtained over all relevance cut-offs. \section{Literature Review} \label{sec:lit} -There has been a great deal of interest as of late on entity-based filtering and ranking. One manifestation of that is the introduction of TREC KBA in 2012. Following that, there have been a number of research works done on the topic \cite{frank2012building, ceccarelli2013learning, taneva2013gem, wang2013bit, balog2013multi}. These works are based on KBA 2012 task and dataset and they address the whole problem of entity filtering and ranking. TREC KBA continued in 2013, but the task underwent some changes. The main change between the 2012 and 2013 are in the number of entities, the type of entities, the corpus and the relevance rankings. + +There has been a great deal of interest as of late on entity-based filtering and ranking. The Text Analysis Conference started Knowledge Base Population with the goal of developing methods and technologies to facilitate the creation and population of KBs \cite{ji2011knowledge}. The most relevant track in KBP is entity-linking: given an entity and +a document containing a mention of the entity, identify the mention in the document and link it to its profile in a KB. Many studies have attempted to address this task \cite{dalton2013neighborhood, dredze2010entity, davis2012named}. + + A more recent manifestation of that is the introduction of TREC KBA in 2012. 
Following that, there have been a number of research works done on the topic \cite{frank2012building, ceccarelli2013learning, taneva2013gem, wang2013bit, balog2013multi}. These works are based on the KBA 2012 task and dataset and they address the whole problem of entity filtering and ranking. TREC KBA continued in 2013, but the task underwent some changes. The main changes between 2012 and 2013 are in the number of entities, the type of entities, the corpus and the relevance rankings. The number of entities increased from 29 to 141, and it included 20 Twitter entities. The TREC KBA 2012 corpus is 1.9TB after xz-compression and has 400M documents. By contrast, the KBA 2013 corpus is 6.45 TB after XZ-compression and GPG encryption. A version with all non-English documents removed is 4.5 TB and consists of 1 billion documents. The 2013 corpus subsumed the 2012 corpus and added others from spinn3r, namely main-stream news, forum, arxiv, classified, reviews and meme-tracker. A more important difference is, however, a change in the definitions of relevance ratings vital and relevant. While in KBA 2012, a document was judged vital if it had citation-worthy content for a given entity, in 2013 it must have freshness, that is, the content must trigger an edit of the given entity's KB entry. 
diff --git a/sigproc.bib b/sigproc.bib index 5a0b3541c12c25805ecf5b10aadbef7bdb7854f5..e4f16cece764a14444b5234b085ec1dd72f8695f 100644 --- a/sigproc.bib +++ b/sigproc.bib @@ -140,4 +140,48 @@ booktitle={LREC}, pages={3168--3175}, year={2012} +} + +@inproceedings{ji2011knowledge, + title={Knowledge base population: Successful approaches and challenges}, + author={Ji, Heng and Grishman, Ralph}, + booktitle={Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1}, + pages={1148--1158}, + year={2011}, + organization={Association for Computational Linguistics} +} + +@techreport{singh12:wiki-links, + author = {Sameer Singh and Amarnag Subramanya and Fernando Pereira and Andrew McCallum}, + title = {Wikilinks: A Large-scale Cross-Document Coreference Corpus Labeled via Links to {Wikipedia}}, + institution = {University of Massachusetts, Amherst}, + number = {UM-CS-2012-015}, + year = {2012} +} + +@inproceedings{dredze2010entity, + title={Entity disambiguation for knowledge base population}, + author={Dredze, Mark and McNamee, Paul and Rao, Delip and Gerber, Adam and Finin, Tim}, + booktitle={Proceedings of the 23rd International Conference on Computational Linguistics}, + pages={277--285}, + year={2010}, + organization={Association for Computational Linguistics} +} + +@inproceedings{dalton2013neighborhood, + title={A neighborhood relevance model for entity linking}, + author={Dalton, Jeffrey and Dietz, Laura}, + booktitle={Proceedings of the 10th Conference on Open Research Areas in Information Retrieval}, + pages={149--156}, + year={2013}, + organization={LE CENTRE DE HAUTES ETUDES INTERNATIONALES D'INFORMATIQUE DOCUMENTAIRE} +} + +@inproceedings{davis2012named, + title={Named entity disambiguation in streaming data}, + author={Davis, Alexandre and Veloso, Adriano and da Silva, Altigran S and Meira Jr, Wagner and Laender, Alberto HF}, + booktitle={Proceedings of the 50th Annual Meeting of the 
Association for Computational Linguistics: Long Papers-Volume 1}, + pages={815--824}, + year={2012}, + organization={Association for Computational Linguistics} } \ No newline at end of file