Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
#
"""
This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``SimpleSearcher``
class, which wraps the Java class with the same name in Anserini.
"""
import logging
from ..pyclass import autoclass, JPaths
logger = logging.getLogger(__name__)
# Wrappers around Lucene classes.
# Each name below is bound to whatever ``autoclass`` returns for the given fully qualified Java class name
# (presumably a Python proxy for that Java class -- confirm against ``..pyclass``).
JQuery = autoclass('org.apache.lucene.search.Query')
JDocument = autoclass('org.apache.lucene.document.Document')
# Wrappers around Anserini classes: topic reading (``io.anserini.search.topicreader``)
# and query generation (``io.anserini.search.query``).
JTopicReader = autoclass('io.anserini.search.topicreader.TopicReader')
JTopics = autoclass('io.anserini.search.topicreader.Topics')
JQueryGenerator = autoclass('io.anserini.search.query.QueryGenerator')
JBagOfWordsQueryGenerator = autoclass('io.anserini.search.query.BagOfWordsQueryGenerator')
JCovid19QueryGenerator = autoclass('io.anserini.search.query.Covid19QueryGenerator')
class Document:
"""Wrapper class for a Lucene ``Document``.
Parameters
----------
document : JDocument
def JArgs():
    """Build an Anserini ``IndexArgs`` object configured for a dry run.

    Returns
    -------
    A new ``io.anserini.index.IndexArgs`` instance with ``storeContents``, ``storeRaw``
    and ``dryRun`` all set to ``True``.
    """
    index_args = autoclass('io.anserini.index.IndexArgs')()
    # ``dryRun`` is switched on last so that indexing will be skipped.
    for flag_name in ('storeContents', 'storeRaw', 'dryRun'):
        setattr(index_args, flag_name, True)
    return index_args
def bm25(k1=0.9, b=0.4):
    """Create a Lucene ``BM25Similarity`` object.

    Parameters
    ----------
    k1 : float
        First constructor argument of ``BM25Similarity``.
    b : float
        Second constructor argument of ``BM25Similarity``.

    Returns
    -------
    An instance of ``org.apache.lucene.search.similarities.BM25Similarity``.
    """
    similarity_class = autoclass('org.apache.lucene.search.similarities.BM25Similarity')
    return similarity_class(k1, b)
from ..pyclass import autoclass, JString
# Wrappers around Lucene classes.
# ``JAnalyzer`` wraps the base ``Analyzer`` class; the names that follow wrap language-specific
# analyzers, where the Java sub-package (``ar``, ``bn``, ``cjk``, ``de``, ``es``, ``fr``, ``hi``) names the language.
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
# NOTE: although listed in the Lucene group, this class lives in the ``io.anserini.analysis`` package.
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')
# Wrappers around Anserini classes (all from ``io.anserini.analysis``).
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JFreebaseAnalyzer = autoclass('io.anserini.analysis.FreebaseAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
def get_lucene_analyzer(name='english', stemming=True, stemmer='porter', stopwords=True) -> JAnalyzer:
"""Create a Lucene ``Analyzer`` with specific settings.
Parameters
----------
name : str
Name of analyzer.
stemming : bool
Set to stem.
stemmer : str
Stemmer to use.
stopwords : bool
args = autoclass('io.anserini.index.IndexArgs')()
args.storeContents = True
args.storeRaw = True
args.dryRun = True ## So that indexing will be skipped
return args
def JCounters():
    """Create an Anserini ``IndexCollection$Counters`` object.

    Returns
    -------
    A new instance of the nested Java class ``io.anserini.index.IndexCollection$Counters``.
    """
    index_collection_class = autoclass('io.anserini.index.IndexCollection')
    counters_class = autoclass('io.anserini.index.IndexCollection$Counters')
    # The enclosing class object itself (not an instance of it) is handed to the constructor.
    return counters_class(index_collection_class)
class JGenerators(Enum):
    """Enumeration of Anserini document generator classes.

    The value of each member is the object returned by ``autoclass`` for the
    corresponding Java class in ``io.anserini.index.generator``.
    """

    DefaultLuceneDocumentGenerator = autoclass('io.anserini.index.generator.DefaultLuceneDocumentGenerator')
    TweetGenerator = autoclass('io.anserini.index.generator.TweetGenerator')
    # Abbreviated member name; the wrapped Java class is ``WashingtonPostGenerator``.
    WapoGenerator = autoclass('io.anserini.index.generator.WashingtonPostGenerator')
class Generator:
"""Wrapper class for Anserini's generators.
Parameters
----------
generator_class : str
Name of generator class to instantiate
"""
def __init__(self, generator_class):
self.counters = JIndexHelpers.JCounters()
self.args = JIndexHelpers.JArgs()
self.generator_class = generator_class
self.object = self._get_generator()
logger = logging.getLogger(__name__)
# Wrappers around Anserini collection classes (``io.anserini.collection``).
JFileSegment = autoclass('io.anserini.collection.FileSegment')
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
class JCollections(Enum):
    """Enumeration of Anserini collection classes.

    Every member is named after a Java class in ``io.anserini.collection`` and its
    value is the object returned by ``autoclass`` for that class.
    """

    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
class Collection:
"""
Iterable wrapper class for Anserini's DocumentCollection.
Parameters
----------
collection_class : str
Name of collection class to instantiate
collection_path : str
Path to directory containing collection
"""
This module provides Pyserini's Python interface query building for Anserini.
"""
import logging
from enum import Enum
from ..analysis import get_lucene_analyzer, Analyzer
from ..pyclass import autoclass
logger = logging.getLogger(__name__)
# Wrappers around Lucene classes
# Lucene building blocks for queries: terms, boolean clauses, boosted queries and term queries.
JTerm = autoclass('org.apache.lucene.index.Term')
JBooleanClause = autoclass('org.apache.lucene.search.BooleanClause')
JBoostQuery = autoclass('org.apache.lucene.search.BoostQuery')
JTermQuery = autoclass('org.apache.lucene.search.TermQuery')
# Wrappers around Anserini classes
# ``JQueryGeneratorUtils`` supplies the ``getBooleanClause*`` accessors used by ``JBooleanClauseOccur``.
JQueryGeneratorUtils = autoclass('io.anserini.search.query.QueryGeneratorUtils')
class JBooleanClauseOccur(Enum):
    """Enumeration of boolean clause occurrence kinds.

    Each member's value is obtained from the matching ``getBooleanClause*``
    accessor on ``JQueryGeneratorUtils`` when this class is defined.
    """

    should = JQueryGeneratorUtils.getBooleanClauseShould()
    must = JQueryGeneratorUtils.getBooleanClauseMust()
    must_not = JQueryGeneratorUtils.getBooleanClauseMustNot()
    # Member name mirrors the Java accessor; it only shadows the ``filter`` builtin inside this class body.
    filter = JQueryGeneratorUtils.getBooleanClauseFilter()
def get_boolean_query_builder():
"""Get a BooleanQueryBuilder object.
def qld(mu=1000):
    """Create a Lucene ``LMDirichletSimilarity`` object.

    Parameters
    ----------
    mu : int
        Value passed as the sole constructor argument of ``LMDirichletSimilarity``.

    Returns
    -------
    An instance of ``org.apache.lucene.search.similarities.LMDirichletSimilarity``.
    """
    similarity_class = autoclass('org.apache.lucene.search.similarities.LMDirichletSimilarity')
    return similarity_class(mu)
# Wrapper around Anserini's ``io.anserini.collection.SourceDocument`` class.
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
class JCollections(Enum):
    """Enumeration of the Anserini collection classes available from Python.

    Member names match the Java class names under ``io.anserini.collection``;
    member values are the objects returned by ``autoclass`` for those classes.
    """

    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
class Collection:
"""
Iterable wrapper class for Anserini's DocumentCollection.
Parameters
----------
collection_class : str
Name of collection class to instantiate
collection_path : str
Path to directory containing collection
"""
def __init__(self, collection_class, collection_path):
self.counters = Counters()
and methods provided are meant only to provide tools for examining an index and are not optimized for computing over.
"""
import logging
from enum import Enum
from typing import Dict, Iterator, List, Optional, Tuple
from ..analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils
from ..pyclass import autoclass, JString
from ..search import Document
logger = logging.getLogger(__name__)
# Wrappers around Anserini classes
# NOTE: the Python name ``JIndexReader`` is bound to Anserini's ``IndexReaderUtils`` class.
JIndexReader = autoclass('io.anserini.index.IndexReaderUtils')
class JIndexHelpers:
    """Factory helpers for Anserini indexing objects.

    Both helpers take no arguments and are declared as static methods, so they work
    whether called on the class (``JIndexHelpers.JArgs()``) or on an instance.
    """

    @staticmethod
    def JArgs():
        """Build an Anserini ``IndexArgs`` object configured for a dry run.

        Returns
        -------
        A new ``io.anserini.index.IndexArgs`` instance with ``storeContents``, ``storeRaw``
        and ``dryRun`` all set to ``True``.
        """
        args = autoclass('io.anserini.index.IndexArgs')()
        args.storeContents = True
        args.storeRaw = True
        args.dryRun = True  # So that indexing will be skipped
        return args

    @staticmethod
    def JCounters():
        """Create an Anserini ``IndexCollection$Counters`` object.

        Returns
        -------
        A new instance of the nested Java class ``io.anserini.index.IndexCollection$Counters``.
        """
        IndexCollection = autoclass('io.anserini.index.IndexCollection')
        Counters = autoclass('io.anserini.index.IndexCollection$Counters')
        # The enclosing class object itself (not an instance of it) is handed to the constructor.
        return Counters(IndexCollection)