Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_analyze_with_analyzer(self):
    """Check that a Lucene analyzer built with stemming disabled only tokenizes."""
    no_stem_analyzer = analysis.get_lucene_analyzer(stemming=False)
    self.assertTrue(isinstance(no_stem_analyzer, JAnalyzer))

    # With stemming off, both query terms are expected back unchanged.
    analyzed = JAnalyzerUtils.analyze(no_stem_analyzer, JString('information retrieval'))
    self.assertEqual(list(analyzed.toArray()), ['information', 'retrieval'])
def analyze(self, text: str, analyzer=None) -> List[str]:
    """Run a piece of text through a Lucene analyzer and collect the tokens.

    Falls back to Anserini's default Lucene analyzer when no analyzer is given.

    Parameters
    ----------
    text : str
        Text to analyze.
    analyzer : analyzer
        Analyzer to apply. If ``None``, ``JAnalyzerUtils.analyze`` is called
        with the text only.

    Returns
    -------
    List[str]
        List of tokens corresponding to the output of the analyzer.
    """
    # The text is handed to the Java side as UTF-8 encoded bytes wrapped in a JString.
    java_text = JString(text.encode('utf-8'))
    if analyzer is not None:
        analyzed = JAnalyzerUtils.analyze(analyzer, java_text)
    else:
        analyzed = JAnalyzerUtils.analyze(java_text)
    return list(analyzed.toArray())
Parameters
----------
text : str
Text to analyze.
analyzer : analyzer
Analyzer to apply.
Returns
-------
List[str]
List of tokens corresponding to the output of the analyzer.
"""
if analyzer is None:
results = JAnalyzerUtils.analyze(JString(text.encode('utf-8')))
else:
results = JAnalyzerUtils.analyze(analyzer, JString(text.encode('utf-8')))
tokens = []
for token in results.toArray():
tokens.append(token)
return tokens