Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
List of search results.
"""
# Dispatch on how the query was supplied: a query generator, a raw Lucene
# JQuery object, or a plain query string (encoded to UTF-8 before crossing
# into the JVM).
hits = None
if query_generator:
    hits = self.object.search(query_generator, JString(q), k)
elif isinstance(q, JQuery):
    # Note that RM3 requires the notion of a query (string) to estimate the appropriate models. If we're just
    # given a Lucene query, it's unclear what the "query" is for this estimation. One possibility is to extract
    # all the query terms from the Lucene query, although this might yield unexpected behavior from the user's
    # perspective. Until we think through what exactly is the "right thing to do", we'll raise an exception
    # here explicitly.
    if self.is_using_rm3():
        raise NotImplementedError('RM3 incompatible with search using a Lucene query.')
    hits = self.object.search(q, k)
else:
    hits = self.object.search(JString(q.encode('utf8')), k)

# Post-process the raw hits: optionally strip segment ids and drop duplicates.
docids = set()
filtered_hits = []
for hit in hits:
    # Segmented docids have the form "docid.segment"; keep only the base docid.
    if strip_segment_id is True:
        hit.docid = hit.docid.split('.')[0]
    # Skip docids already seen. Note that docids only grows when remove_dups
    # is set (below), so this check is a no-op otherwise.
    if hit.docid in docids:
        continue
    filtered_hits.append(hit)
    if remove_dups is True:
        docids.add(hit.docid)
term : str
Term.
analyzer : analyzer
Lucene analyzer to use, ``None`` if term is already analyzed.
k1 : float
BM25 k1 parameter.
b : float
BM25 b parameter.
Returns
-------
float
BM25 weight of the term in the document, or 0 if the term does not exist in the document.
"""
if analyzer is None:
return self.object.getBM25AnalyzedTermWeightWithParameters(self.reader, JString(docid),
JString(term.encode('utf-8')),
float(k1), float(b))
else:
return self.object.getBM25UnanalyzedTermWeightWithParameters(self.reader, JString(docid),
JString(term.encode('utf-8')), analyzer,
float(k1), float(b))
"""Return the :class:`Document` based on a ``field`` with ``id``. For example, this method can be used to fetch
document based on alternative primary keys that have been indexed, such as an article's DOI.
Parameters
----------
field : str
The field to look up.
q : str
The document's unique id.
Returns
-------
Optional[Document]
:class:`Document` whose ``field`` is ``id``.
"""
lucene_document = self.object.documentByField(self.reader, JString(field), JString(q))
if lucene_document is None:
return None
return Document(lucene_document)
----------
q : str
Query string.
f : str
Additional field to search.
boost : float
Weight boost for additional field.
k : int
Number of hits to return.
Returns
-------
List[JSimpleSearcherResult]
List of document hits returned from the search.
"""
return self.object.searchFields(JString(q), JString(f), float(boost), k)
Parameters
----------
term : str
Raw term.
analyzer : analyzer
Analyzer to apply. Defaults to Anserini's default.
Returns
-------
List[Posting]
List of :class:`Posting` objects corresponding to the postings list for the term.
"""
if analyzer is None:
    # Term is assumed to be already analyzed; look up its postings directly.
    postings_list = self.object.getPostingsListForAnalyzedTerm(self.reader, JString(term.encode('utf-8')))
else:
    # Run the supplied Lucene analyzer over the raw term before lookup.
    postings_list = self.object.getPostingsListWithAnalyzer(self.reader, JString(term.encode('utf-8')),
                                                            analyzer)
# The Java side returns null when the term does not occur in the index.
if postings_list is None:
    return None

# Convert the Java postings array into Python Posting objects.
result = []
for posting in postings_list.toArray():
    result.append(Posting(posting.getDocid(), posting.getTF(), posting.getPositions()))
return result
k1 : float
BM25 k1 parameter.
b : float
BM25 b parameter.
Returns
-------
float
BM25 weight of the term in the document, or 0 if the term does not exist in the document.
"""
if analyzer is None:
return self.object.getBM25AnalyzedTermWeightWithParameters(self.reader, JString(docid),
JString(term.encode('utf-8')),
float(k1), float(b))
else:
return self.object.getBM25UnanalyzedTermWeightWithParameters(self.reader, JString(docid),
JString(term.encode('utf-8')), analyzer,
float(k1), float(b))
Parameters
----------
term : str
Unanalyzed term.
analyzer : analyzer
Analyzer to apply.
Returns
-------
Tuple[int, int]
Document frequency and collection frequency.
"""
if analyzer is None:
    # Default to an exact-match analyzer: no stemming, no stopword removal.
    analyzer = get_lucene_analyzer(stemming=False, stopwords=False)
term_map = self.object.getTermCountsWithAnalyzer(self.reader, JString(term.encode('utf-8')), analyzer)
# The Java side returns a map keyed by 'docFreq' and 'collectionFreq'.
return term_map.get(JString('docFreq')), term_map.get(JString('collectionFreq'))
def analyze(self, text: str) -> List[str]:
    """Analyze a piece of text.

    Parameters
    ----------
    text : str
        Text to analyze.

    Returns
    -------
    List[str]
        List of tokens corresponding to the output of the analyzer.
    """
    # Encode to UTF-8 before crossing into the JVM, consistent with the
    # other methods in this class.
    results = JAnalyzerUtils.analyze(self.analyzer, JString(text.encode('utf-8')))
    # Materialize the Java token collection into a Python list; the manual
    # append loop this replaces was the non-idiomatic equivalent.
    return list(results.toArray())