Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_cacm(self):
    """Download the CACM collection and check parsing and document generation.

    Fetches the CACM tarball from the Anserini GitHub repository, extracts it
    into a randomly suffixed directory, iterates it as an ``HtmlCollection``
    and passes every document through ``DefaultLuceneDocumentGenerator``,
    asserting that both the source document and the generated document
    expose non-empty fields. The downloaded tarball and the extracted
    directory are removed afterwards, whether or not the assertions pass.
    """
    # Function-scope imports: only the cleanup step needs them.
    import os
    import shutil

    # We're going to append a random value to downloaded files:
    r = randint(0, 10000000)
    url = 'https://github.com/castorini/anserini/blob/master/src/main/resources/cacm/cacm.tar.gz?raw=true'
    tarball_name = 'cacm{}.tar.gz'.format(r)
    directory = 'collection{}/'.format(r)

    try:
        # Download and extract exactly once; the context manager closes the
        # archive even if extraction raises.
        urlretrieve(url, tarball_name)
        with tarfile.open(tarball_name) as tarball:
            tarball.extractall(directory)

        cacm = collection.Collection('HtmlCollection', directory)
        generator = index.Generator('DefaultLuceneDocumentGenerator')

        cnt = 0
        for fs in cacm:
            for doc in fs:
                self.assertIsInstance(doc, collection.SourceDocument)
                self.assertIsNotNone(doc.raw)
                self.assertNotEqual(doc.raw, '')
                # NOTE(review): the marker literal was an empty string here,
                # which made the "in" check vacuous and the "not in" check
                # impossible to satisfy. '<html>' is assumed to be the
                # intended marker for an HtmlCollection -- confirm.
                self.assertIn('<html>', doc.raw)
                self.assertIsNotNone(doc.contents)
                self.assertNotEqual(doc.contents, '')
                self.assertNotIn('<html>', doc.contents)

                parsed = generator.create_document(doc)
                docid = parsed.get('id')  # FIELD_ID
                raw = parsed.get('raw')  # FIELD_RAW
                contents = parsed.get('contents')  # FIELD_BODY
                self.assertNotEqual(docid, '')
                self.assertIsNotNone(raw)
                self.assertNotEqual(raw, '')
                self.assertIn('html', raw)
                self.assertIsNotNone(contents)
                cnt += 1

        # Guard against a vacuous pass when the collection yields nothing.
        self.assertGreater(cnt, 0)
    finally:
        # Clean up whatever was created, even after a failed assertion.
        if os.path.exists(tarball_name):
            os.remove(tarball_name)
        shutil.rmtree(directory, ignore_errors=True)
def main(path):
collection = pyserini.collection.Collection('Cord19AbstractCollection', path)
articles = collection.__next__()
with open("articles.csv", 'w') as article_csv, open("edges.csv", 'w') as edge_csv:
article_csv = csv.writer(article_csv)
edge_csv = csv.writer(edge_csv)
article_csv.writerow(["cord_uid", "title", "pmcid"])
edge_csv.writerow(["cord_uid", "target_title", "doi"])
prev_titles = set()
prev_cord_uid = set()
for d in articles:
article = pyserini.collection.Cord19Article(d.raw)
title = article.title()
cord_uid = article.cord_uid()
if article.is_full_text() and title and title not in prev_titles \
and cord_uid not in prev_cord_uid:
def main(path):
collection = pyserini.collection.Collection('Cord19AbstractCollection', path)
articles = collection.__next__()
with open("articles.csv", 'w') as article_csv, open("edges.csv", 'w') as edge_csv:
article_csv = csv.writer(article_csv)
edge_csv = csv.writer(edge_csv)
article_csv.writerow(["cord_uid", "title", "pmcid"])
edge_csv.writerow(["cord_uid", "target_title", "doi"])
prev_titles = set()
prev_cord_uid = set()
for d in articles:
article = pyserini.collection.Cord19Article(d.raw)
title = article.title()
cord_uid = article.cord_uid()
if article.is_full_text() and title and title not in prev_titles \
and cord_uid not in prev_cord_uid:
article_data = [article.cord_uid(), escape_title(title),
article.json["paper_id"]]
article_csv.writerow(article_data)
prev_titles.add(title)
prev_cord_uid.add(cord_uid)
bib_entries = article.bib_entries()
# Create edge between article and each cited title
for bib_ref in bib_entries:
ref = bib_entries[bib_ref]
if ref['title']:
doi = ref['other_ids'].get('DOI')