// // PLMRetEval // // 13 November 2010 -- Yuanhua Lv @ UIUC (ylv2@uiuc.edu) // // #include "common_headers.hpp" #include "IndexManager.hpp" #include "BasicDocStream.hpp" #include "RetMethodManager.hpp" #include "ResultFile.hpp" #include "RelDocUnigramCounter.hpp" using namespace lemur::api; using namespace lemur::retrieval; const double pi = 3.14159265358979323846; namespace LocalParameter { // Smoothing Strategy: 0 for linear smoothing while 1 for dirichlet prior smoothing int PLMSmoothMethod; // Jelinek-Merce Smoothing parameter double PLMJMLambda; // Dirichlet Prior double PLMDirPrior; // Number of documents to be re-ranked int RerankedDocCount; // Propagation function (kernel) int PropFunction; // Standard variance for kernel density estimation, i.e., sigma double PLMSigma; // Score interpolation coefficient, i.e., weight of PLM, when interpolating plm based score with the original score double PLMCoefficient; // 0 : apply plm on single term query; 1 : do not apply plm on single term query int IgnoreSingleTermQuery; void get() { PLMSigma = ParamGetDouble("PLMSigma", 175); PLMSmoothMethod = ParamGetInt("PLMSmoothMethod", 1); PLMJMLambda = ParamGetDouble("PLMJMLambda", 0.5); PLMDirPrior = ParamGetDouble("PLMDirPrior", 500); RerankedDocCount = ParamGetInt("RerankedDocCount", 2000); // Default: Gaussian Kernel PropFunction = ParamGetInt("PropFunction", 0); PLMCoefficient = ParamGetDouble("PLMCoefficient", 1.0); IgnoreSingleTermQuery = ParamGetInt("IgnoreSingleTermQuery", 0); } }; void GetAppParam() { RetrievalParameter::get(); SimpleKLParameter::get(); LocalParameter::get(); } //void PrintModel(const map& myLM, Index& ind) //{ // map::const_iterator mit; // for(mit = myLM.begin(); mit != myLM.end(); mit++) // { // cout << ind.term(mit->first) << "(" << mit->first << ")" << "\t" << mit->second << endl; // } //} /// Compute the "negative" KL-div of the query model and any document LM, where docLM has been smoothed double KLDivergence(const map& qLM, const map& docLM) { double div = 0; map::const_iterator mit; for(mit = qLM.begin(); mit != qLM.end(); mit++) { double pr = mit->second; int tid = mit->first; map::const_iterator iter = docLM.find(tid); // docLM has been smoothed, so there is no zero prob. double doc_prob = iter->second; div += pr * log (pr / doc_prob); } return -div; } /// Smooth a PLM using linear smoothing method /// i.e., LM = LM_mle * (1 - lambda) + LM_Col * lambda void JMSmoothing(const map& qLM, map& docLM, Index& ind, const double lambda) { map::const_iterator mit; for(mit = qLM.begin(); mit != qLM.end(); mit++) { int tid = mit->first; double col_prob = (double) (ind.termCount(tid)) / (double) (ind.termCount()); map::const_iterator iter = docLM.find(tid); double doc_prob = (iter == docLM.end()) ? lambda * col_prob : ((1 - lambda) * iter->second + lambda * col_prob); docLM[tid] = doc_prob; } } /// Smooth a PLM using Dirichlet prior smoothing method void DirSmoothing(const map& qLM, map& docLM, Index& ind, const double doc_len, const double mu) { double lambda = mu / (mu + doc_len); JMSmoothing(qLM, docLM, ind, lambda); } // Cumulative normal distribution function at x double GaussianCDF(double x, double mean, double sigma) { double res; x=(x - mean) / sigma; if (x == 0) { res=0.5; } else { double oor2pi = 1/(sqrt(double(2) * pi)); double t = 1 / (double(1) + 0.2316419 * fabs(x)); t *= oor2pi * exp(-0.5 * x * x) * (0.31938153 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + t * 1.330274429)))); if (x >= 0) { res = double(1) - t; } else { res = t; } } return res; } // Cumulative distribution function of Triangle Kernel double TriangleCDF(double x, double mean, double sigma) { double res; x = (x - mean) / sigma; if(x >= 1) { res = sigma; } else if(x < -1) { res = 0; } else if(x < 0) { res = sigma * (1 - fabs(x)) * (1 - fabs(x)) / 2.0; } else { res = sigma - sigma * (1 - x) * (1 - x) / 2.0; } return res; } // Cumulative distribution function of Cosine Kernel double CosineCDF(double x, double mean, double sigma) { double res; x = (x - mean) / sigma; if(x >= 1) { res = sigma; } else if(x < -1) { res = 0; } else if(x < 0) { res = sigma * (1 + x - sin(pi * x) / pi) / 2.0; } else { res = sigma - sigma * (1 - x + sin(pi * x) / pi) / 2.0; } //cout << res << endl; return res; } // Cumulative distribution function of Circle Kernel double CircleCDF(double x, double mean, double sigma) { double res; x = (x - mean) / sigma; if(x >= 1) { res = (pi - 2.0) * sigma; } else if(x < -1) { res = 0; } else if(x < 0) { res = sigma * (asin(x) + pi / 2.0 - sqrt(1 - x * x)); } else { res = (pi - 2.0) * sigma - sigma * (asin(-x) + pi / 2.0 - sqrt(1 - x * x)); } return res; } // Cumulative distribution function of Arc Kernel double ArcCDF(double x, double mean, double sigma) { double res; x = (x - mean) / sigma; if(x >= 1) { res = (pi - 1.0) * sigma / 2.0; } else if(x < -1) { res = 0; } else if(x < 0) { res = sigma * (asin(x) + pi / 2.0 - sqrt(1 - x * x) + (1 - fabs(x)) * (1 - fabs(x)) / 2.0) / 2.0; } else { res = (pi - 1.0) * sigma / 2.0 - sigma * (asin(-x) + pi / 2.0 - sqrt(1 - x * x) + (1 - fabs(x)) * (1 - fabs(x)) / 2.0) / 2.0; } return res; } /// The propagated count from one position to another, when the distance between two positions is "dis" /// Propagation function: -1 Passage; 0 Gaussian; 1: Cosine; 2: Triangle; 3: Arc; 4: Circle /// Here, "dis" is normalized double PropagationCount(double dis, int propFunction = 0) { dis = fabs(dis); switch (propFunction) { case -1: if(dis <= 1.0) return 1.0; else return 0; case 1: if(dis <= 1.0) return (1 + cos(pi * dis)) / 2.0; else return 0; case 2: if(dis <= 1.0) return 1 - dis; else return 0; case 3: if(dis <= 1.0) return (1 - dis + sqrt(1 - dis * dis)) / 2.0; else return 0; case 4: if(dis <= 1.0) return sqrt(1 - dis * dis); else return 0; case 0: default: return exp( - dis * dis / 2); } } /// The sum of propogated counts from all positions to the current position "pos", i.e., the size of the vitual passage /// Propagation function: -1 Passage; 0 Gaussian; 1: Cosine; 2: Triangle; 3: Arc; 4: Circle double PropagationCountSum (int pos, double doc_len, int propFunction = 0) { double sigma = LocalParameter::PLMSigma; double psg_len = 0; switch (propFunction) { case -1: if(sigma > doc_len - pos) { psg_len += doc_len - pos; } else { psg_len += sigma; } if(sigma > pos) { psg_len += pos; } else { psg_len += sigma; } break; case 1: psg_len = CosineCDF(doc_len, (double) pos, sigma) - CosineCDF(0, (double) pos, sigma); break; case 2: psg_len = TriangleCDF(doc_len, (double) pos, sigma) - TriangleCDF(0, (double) pos, sigma); break; case 3: psg_len = ArcCDF(doc_len, (double) pos, sigma) - ArcCDF(0, (double) pos, sigma); break; case 4: psg_len = CircleCDF(doc_len, (double) pos, sigma) - CircleCDF(0, (double) pos, sigma); break; case 0: default: psg_len = sqrt(double(2) * pi) * sigma * (GaussianCDF(doc_len, (double) pos, sigma) - GaussianCDF(0, (double) pos, sigma)); break; } return psg_len; } /// Compute the relevance of a document using PLM double PLMDocScoring(const map& qLM, const lemur::api::DOCID_T did, Index& ind) { // sorted query terms by position ascending IndexedRealVector qTermPosList; const TermInfoList* tlist = ind.termInfoListSeq(did); int pos = 0; tlist->startIteration(); while (tlist->hasMore()) { pos++; TermInfo* entry = tlist->nextEntry(); lemur::api::TERMID_T tid = entry->termID(); // if it is a query term if(qLM.find(tid) != qLM.end()) { qTermPosList.PushValue(tid, pos); //cout << ind.term(tid) << ":" << pos << "\n"; } } delete tlist; const double doc_len = ind.docLength(did); // relevance score by PLM double plmScore = -1000000; // to improve efficiency, we only score positions where a query term occur. // intuitively, the best matching position would be at least close to these positions. // this is different from our previous implementation for sigir'09 paper, but the performance // is similar for(int i = 0; i < qTermPosList.size(); i++) { const int center = (int) qTermPosList[i].val; // the total propagated counts to the current position "center" const double psg_len = PropagationCountSum(center, doc_len, LocalParameter::PropFunction); // positional language model at "center" map plm; for(int j = 0; j < qTermPosList.size(); j++) { const int pos = (int) qTermPosList[j].val; const int tid = qTermPosList[j].ind; // the (normalized) propagated count from a query term "tid" at position "pos" double pr = PropagationCount((double) (pos - center) / LocalParameter::PLMSigma, LocalParameter::PropFunction) / psg_len; if(pr > 0) { if(plm.find(tid) == plm.end()) { plm.insert(make_pair(tid, pr)); } else { plm[tid] += pr; } } } // smooth PLM if(LocalParameter::PLMSmoothMethod == 0) { JMSmoothing(qLM, plm, ind, LocalParameter::PLMJMLambda); } else // if(LocalParameter::PLMSmoothingStrategy == 1) { DirSmoothing(qLM, plm, ind, psg_len, LocalParameter::PLMDirPrior); } // KL-divergence double score = KLDivergence(qLM, plm); if(score > plmScore) { plmScore = score; } } return plmScore; } /// Rerank result documents using PLM void PLMReranking(const map& qLM, IndexedRealVector& retResults, Index& ind) { IndexedRealVector sortedList; IndexedRealVector::iterator dit; for(dit = retResults.begin(); dit != retResults.end(); dit++) { lemur::api::DOCID_T did = dit->ind; double score = dit->val * (1 - LocalParameter::PLMCoefficient) + PLMDocScoring(qLM, did, ind) * LocalParameter::PLMCoefficient; sortedList.PushValue(did, score); if(sortedList.size() >= LocalParameter::RerankedDocCount) { break; } } sortedList.Sort(); if(retResults.size() > LocalParameter::RerankedDocCount) { retResults.erase(retResults.begin(), retResults.begin() + LocalParameter::RerankedDocCount); } else { retResults.clear(); } retResults.insert(retResults.begin(), sortedList.begin(), sortedList.end()); //retResults.Sort(); } /// A retrieval evaluation program int AppMain(int argc, char *argv[]) { Index *ind; try { ind = IndexManager::openIndex(RetrievalParameter::databaseIndex); } catch (Exception &ex) { ex.writeMessage(); throw Exception("RelEval", "Can't open index, check parameter index"); } DocStream *qryStream; try { qryStream = new lemur::parse::BasicDocStream(RetrievalParameter::textQuerySet); } catch (Exception &ex) { ex.writeMessage(cerr); throw Exception("RetEval", "Can't open query file, check parameter textQuery"); } ofstream result(RetrievalParameter::resultFile.c_str()); ResultFile resFile(RetrievalParameter::TRECresultFileFormat); resFile.openForWrite(result, *ind); ifstream *workSetStr; ResultFile *docPool; if (RetrievalParameter::useWorkingSet) { workSetStr = new ifstream(RetrievalParameter::workSetFile.c_str(), ios::in); if (workSetStr->fail()) { throw Exception("RetEval", "can't open working set file"); } docPool = new ResultFile(false); // working set is always simple format docPool->openForRead(*workSetStr, *ind); } lemur::retrieval::ArrayAccumulator accumulator(ind->docCount()); IndexedRealVector results(ind->docCount()); RetrievalMethod *model = NULL; model = RetMethodManager::createModel(ind, &accumulator, RetrievalParameter::retModel); qryStream->startDocIteration(); TextQuery *q; IndexedRealVector workSetRes; bool ignoreWeights = true; bool doingRelModel = (RetrievalParameter::fbDocCount > 0 && RetrievalParameter::retModel == "kl" && (SimpleKLParameter::qryPrm.fbMethod == SimpleKLParameter::RM1 || SimpleKLParameter::qryPrm.fbMethod == SimpleKLParameter::RM2)); while (qryStream->hasMore()) { Document *d = qryStream->nextDoc(); q = new TextQuery(*d); cout << "query : "<< q->id() << endl; QueryRep * qr = model->computeQueryRep(*q); PseudoFBDocs *workSet; if (RetrievalParameter::useWorkingSet) { docPool->getResult(q->id(), workSetRes); workSet = new PseudoFBDocs(workSetRes, -1); // -1 means using all docs model->scoreDocSet(*qr,*workSet,results); } else { model->scoreCollection(*qr, results); } results.Sort(); /********** Added by Yuanhua: begin ************ Note: other part of the AppMain function is the same as the RetEval.cpp file provided by Lemur. ************************************************/ // push query information into a hash table "qLM" lemur::retrieval::SimpleKLQueryModel * klqr = (lemur::retrieval::SimpleKLQueryModel *) qr; map qLM; klqr->startIteration(); while(klqr->hasMore()) { QueryTerm *qt; qt = klqr->nextTerm(); lemur::api::TERMID_T tid = qt->id(); qLM.insert(make_pair(tid, qt->weight() / klqr->totalCount())); delete qt; } // do we apply PLM on single-term queries? if(LocalParameter::IgnoreSingleTermQuery == 1 && qLM.size() == 1) { } else { // rerank documents PLMReranking(qLM, results, *ind); } /********** Added by Yuanhua: end *************/ // prune to number of feedback docs. if (RetrievalParameter::fbDocCount > 0 && results.size() > RetrievalParameter::fbDocCount) results.erase(results.begin() + RetrievalParameter::fbDocCount, results.end()); if (doingRelModel) { if (SimpleKLParameter::qryPrm.adjScoreMethod != SimpleKLParameter::QUERYLIKELIHOOD) { throw Exception("RetEval:FB", "Relevance models require query likelihood scores."); } ignoreWeights = false; results.LogToPosterior(); } if (RetrievalParameter::retModel == "indri") ignoreWeights = false; if (RetrievalParameter::fbDocCount > 0) { PseudoFBDocs *topDoc = new PseudoFBDocs(results, RetrievalParameter::fbDocCount, ignoreWeights); model->updateQuery(*qr, *topDoc); if (RetrievalParameter::useWorkingSet) { model->scoreDocSet(*qr,*workSet,results); } else { model->scoreCollection(*qr, results); } results.Sort(); delete topDoc; } resFile.writeResults(q->id(), &results, RetrievalParameter::resultCount); if (RetrievalParameter::useWorkingSet) { delete workSet; } delete qr; delete q; } if (RetrievalParameter::useWorkingSet) { delete docPool; delete workSetStr; } delete model; result.close(); delete qryStream; delete ind; return 0; }