How to use the pyserini.collection module in pyserini

To help you get started, we’ve selected a few pyserini examples that show popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github castorini / pyserini / tests / test_collection.py View on Github external
def test_cacm(self):
        # We're going to append a random value to downloaded files:
        r = randint(0, 10000000)
        url = 'https://github.com/castorini/anserini/blob/master/src/main/resources/cacm/cacm.tar.gz?raw=true'
        tarball_name = 'cacm{}.tar.gz'.format(r)
        directory = 'collection{}/'.format(r)

        _, _ = urlretrieve(url, tarball_name)

        tarball = tarfile.open(tarball_name)
        tarball.extractall(directory)
        tarball.close()

        cacm = collection.Collection('HtmlCollection', directory)
        generator = index.Generator('DefaultLuceneDocumentGenerator')

        cnt = 0
        for (i, fs) in enumerate(cacm):
            for (j, doc) in enumerate(fs):
                self.assertTrue(isinstance(doc, collection.SourceDocument))
                self.assertTrue(doc.raw is not None)
                self.assertTrue(doc.raw != '')
                self.assertTrue('' in doc.raw)
                self.assertTrue(doc.contents is not None)
                self.assertTrue(doc.contents != '')
                self.assertTrue('' not in doc.contents)

                parsed = generator.create_document(doc)
                docid = parsed.get('id')            # FIELD_ID
                raw = parsed.get('raw')             # FIELD_RAW
github castorini / pyserini / tests / test_collection.py View on Github external
tarball_name = 'cacm{}.tar.gz'.format(r)
        directory = 'collection{}/'.format(r)

        _, _ = urlretrieve(url, tarball_name)

        tarball = tarfile.open(tarball_name)
        tarball.extractall(directory)
        tarball.close()

        cacm = collection.Collection('HtmlCollection', directory)
        generator = index.Generator('DefaultLuceneDocumentGenerator')

        cnt = 0
        for (i, fs) in enumerate(cacm):
            for (j, doc) in enumerate(fs):
                self.assertTrue(isinstance(doc, collection.SourceDocument))
                self.assertTrue(doc.raw is not None)
                self.assertTrue(doc.raw != '')
                self.assertTrue('' in doc.raw)
                self.assertTrue(doc.contents is not None)
                self.assertTrue(doc.contents != '')
                self.assertTrue('' not in doc.contents)

                parsed = generator.create_document(doc)
                docid = parsed.get('id')            # FIELD_ID
                raw = parsed.get('raw')             # FIELD_RAW
                contents = parsed.get('contents')   # FIELD_BODY
                self.assertTrue(docid != '')
                self.assertTrue(raw is not None)
                self.assertTrue(raw != '')
                self.assertTrue('html' in raw)
                self.assertTrue(contents is not None)
github castorini / pyserini / scripts / cord19 / extract_citation_graph.py View on Github external
def main(path):
    collection = pyserini.collection.Collection('Cord19AbstractCollection', path)
    articles = collection.__next__()

    with open("articles.csv", 'w') as article_csv, open("edges.csv", 'w') as edge_csv:
        article_csv = csv.writer(article_csv)
        edge_csv = csv.writer(edge_csv)
        article_csv.writerow(["cord_uid", "title", "pmcid"])
        edge_csv.writerow(["cord_uid", "target_title", "doi"])

        prev_titles = set()
        prev_cord_uid = set()
        for d in articles:
            article = pyserini.collection.Cord19Article(d.raw)
            title = article.title()
            cord_uid = article.cord_uid()
            if article.is_full_text() and title and title not in prev_titles \
                    and cord_uid not in prev_cord_uid:
github castorini / pyserini / scripts / cord19 / extract_citation_graph.py View on Github external
def main(path):
    collection = pyserini.collection.Collection('Cord19AbstractCollection', path)
    articles = collection.__next__()

    with open("articles.csv", 'w') as article_csv, open("edges.csv", 'w') as edge_csv:
        article_csv = csv.writer(article_csv)
        edge_csv = csv.writer(edge_csv)
        article_csv.writerow(["cord_uid", "title", "pmcid"])
        edge_csv.writerow(["cord_uid", "target_title", "doi"])

        prev_titles = set()
        prev_cord_uid = set()
        for d in articles:
            article = pyserini.collection.Cord19Article(d.raw)
            title = article.title()
            cord_uid = article.cord_uid()
            if article.is_full_text() and title and title not in prev_titles \
                    and cord_uid not in prev_cord_uid:
                article_data = [article.cord_uid(), escape_title(title),
                                article.json["paper_id"]]
                article_csv.writerow(article_data)
                prev_titles.add(title)
                prev_cord_uid.add(cord_uid)

                bib_entries = article.bib_entries()
                # Create edge between article and each cited title
                for bib_ref in bib_entries:
                    ref = bib_entries[bib_ref]
                    if ref['title']:
                        doi = ref['other_ids'].get('DOI')