from pyserini import analysis
from pyserini.analysis import Analyzer

def test_analysis(self):
    # The default analyzer uses the Porter stemmer.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])
    # Specify the Porter stemmer explicitly.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemmer='porter'))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])
    # Specify the Krovetz stemmer explicitly.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemmer='krovetz'))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])
    # No stemming.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])
    # No stopword filter, no stemming.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False, stopwords=False))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'buses', 'are', 'running', 'on', 'time'])
    # No stopword filter, with stemming.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=True, stopwords=False))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
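# Standalone usage sketch (not from the test suite): the same Analyzer API works
# outside a unittest harness; the expected output is taken from the assertions above.
from pyserini.analysis import Analyzer, get_lucene_analyzer

krovetz_analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
print(krovetz_analyzer.analyze('City buses are running on time.'))
# -> ['city', 'bus', 'running', 'time']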
def test_analyze(self):
    self.assertEqual(' '.join(self.index_reader.analyze('retrieval')), 'retriev')
    self.assertEqual(' '.join(self.index_reader.analyze('rapid retrieval, space economy')),
                     'rapid retriev space economi')
    tokenizer = analysis.get_lucene_analyzer(stemming=False)
    self.assertEqual(' '.join(self.index_reader.analyze('retrieval', analyzer=tokenizer)), 'retrieval')
    self.assertEqual(' '.join(self.index_reader.analyze('rapid retrieval, space economy', analyzer=tokenizer)),
                     'rapid retrieval space economy')
    # Test UTF-8 encoding:
    self.assertEqual(self.index_reader.analyze('zoölogy')[0], 'zoölog')
    self.assertEqual(self.index_reader.analyze('zoölogy', analyzer=tokenizer)[0], 'zoölogy')
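# Usage sketch outside the tests: analyze() applies the analysis chain to free text.
# The index path below is illustrative, not part of the original code.
from pyserini.index import IndexReader

index_reader = IndexReader('indexes/sample_collection_jsonl')  # hypothetical index path
print(index_reader.analyze('rapid retrieval, space economy'))
# -> ['rapid', 'retriev', 'space', 'economi'] with the default Porter-stemming analyzer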
def test_different_analyzers_are_different(self):
    self.searcher.set_analyzer(analysis.get_lucene_analyzer(stemming=False))
    hits_first = self.searcher.search('information retrieval')
    self.searcher.set_analyzer(analysis.get_lucene_analyzer())
    hits_second = self.searcher.search('information retrieval')
    self.assertNotEqual(hits_first, hits_second)
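# Usage sketch: swapping the query-time analyzer on a searcher. The index path is
# illustrative; the test above assumes a pre-built self.searcher fixture.
from pyserini.search import SimpleSearcher
from pyserini.analysis import get_lucene_analyzer

searcher = SimpleSearcher('indexes/sample_collection_jsonl')  # hypothetical index path
searcher.set_analyzer(get_lucene_analyzer(stemming=False))
hits = searcher.search('information retrieval')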
def get_term_counts(self, term, analyzer=None):
    """Return the document frequency and collection frequency of a term. Applies a minimal (no stemming,
    no stopwords) ``Analyzer`` if analyzer is not specified.

    Parameters
    ----------
    term : str
        Unanalyzed term.
    analyzer : analyzer
        Analyzer to apply.

    Returns
    -------
    Tuple[int, int]
        Document frequency and collection frequency.
    """
    if analyzer is None:
        analyzer = get_lucene_analyzer(stemming=False, stopwords=False)
    term_map = self.object.getTermCountsWithAnalyzer(self.reader, JString(term.encode('utf-8')), analyzer)
    return term_map.get(JString('docFreq')), term_map.get(JString('collectionFreq'))
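# Usage sketch, reusing the hypothetical index_reader from above: document frequency
# and collection frequency for a raw (unanalyzed) term.
df, cf = index_reader.get_term_counts('retrieval')
print(f'df={df}, cf={cf}')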
def compute_bm25_term_weight(self, docid: str, term: str, analyzer=get_lucene_analyzer(), k1=0.9, b=0.4) -> float:
    """Compute the BM25 weight of a term in a document. Specify ``analyzer=None`` for an already analyzed term,
    e.g., from the output of :func:`get_document_vector`.

    Parameters
    ----------
    docid : str
        Collection ``docid``.
    term : str
        Term.
    analyzer : analyzer
        Lucene analyzer to use, ``None`` if term is already analyzed.
    k1 : float
        BM25 k1 parameter.
    b : float
        BM25 b parameter.

    Returns
    -------
    float
        BM25 weight of the term in the document.
    """
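# Usage sketch: BM25 weight of an already analyzed term ('retriev' is the Porter stem
# of 'retrieval'); the docid 'doc1' is illustrative only.
weight = index_reader.compute_bm25_term_weight('doc1', 'retriev', analyzer=None)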
def get_postings_list(self, term: str, analyzer=get_lucene_analyzer()) -> List[Posting]:
    """Return the postings list for a term.

    Parameters
    ----------
    term : str
        Raw term.
    analyzer : analyzer
        Analyzer to apply; defaults to Anserini's default analyzer.

    Returns
    -------
    List[Posting]
        List of :class:`Posting` objects corresponding to the postings list for the term.
    """
    if analyzer is None:
        postings_list = self.object.getPostingsListForAnalyzedTerm(self.reader, JString(term.encode('utf-8')))
    else:
        # Assumed analyzer-aware counterpart in Anserini's IndexReaderUtils; the original
        # snippet is truncated after the analyzed-term branch.
        postings_list = self.object.getPostingsListWithAnalyzer(self.reader, JString(term.encode('utf-8')),
                                                                analyzer)
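# Usage sketch: walking a postings list. Posting objects expose docid, tf, and
# positions attributes (per Pyserini's documented IndexReader usage).
postings = index_reader.get_postings_list('retrieval')
for posting in postings:
    print(f'docid={posting.docid}, tf={posting.tf}, pos={posting.positions}')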
def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()):
    """Build a Lucene term query for a single query term.

    Parameters
    ----------
    term : str
        The query term string.
    field : str
        Field to search.
    analyzer : Analyzer
        Analyzer to use for tokenizing the query term.

    Returns
    -------
    JTermQuery
    """
    analyzer = Analyzer(analyzer)
    # Wrap the first analyzed token in a Lucene TermQuery (JTermQuery/JTerm are
    # Pyserini's wrappers around the Lucene classes).
    return JTermQuery(JTerm(field, analyzer.analyze(term)[0]))
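# Usage sketch, based on Pyserini's query-builder documentation: construct a term
# query and pass the query object directly to a searcher.
from pyserini.search import querybuilder

query = querybuilder.get_term_query('hubble')
hits = searcher.search(query)  # reuses the hypothetical searcher from above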