import ijson

def _process_json(filename):
    if filename is None:
        return None
    cnt = 0
    selected = 0
    onlyArticles = True
    PO_NAME = 'wikidata.po'
    SAVE_INTERVAL = 1000
    PROCESS_NOF_ENTRIES = 2 * 1000 * 1000
    po_file = _create_empty_po_file()
    with open(filename, 'r') as json_data:
        # Stream top-level array items one at a time instead of loading the whole dump.
        value = ijson.items(json_data, 'item')
        for item in value:
            label = item.get('labels')
            if label is None:
                continue
            item_id = item['id']
            if onlyArticles:
                if item_id is None or not item_id.startswith("Q"):
                    continue
            comment = u'Article {0}'.format(item_id)
            en_label = label.get('en')
            ca_label = label.get('ca')
            if en_label is None or ca_label is None:
                continue  # skip entries missing either label (the snippet is truncated here in the source)
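
The 'item' prefix above tells ijson to yield each element of a top-level JSON array. A minimal, self-contained sketch of that behaviour (the sample data and variable names here are illustrative, not taken from the snippet above):

import io
import ijson

# Hypothetical sample: a top-level JSON array, shaped like the wikidata dump above.
sample = io.BytesIO(b'[{"id": "Q1", "labels": {"en": "universe"}}, {"id": "P31", "labels": {}}]')

# The 'item' prefix matches each element of the top-level array.
for entity in ijson.items(sample, 'item'):
    print(entity['id'], entity.get('labels'))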

try:
    agency = Agency.objects.get(id=agency_id)
except Agency.DoesNotExist as e:
    logger.exception(e)
    raise

# Get schema info (schema path, dataset_prefix)
schema_info = JSON_SCHEMAS.get(schema, None)
with transaction.atomic():
    audit = Audit.objects.create(agency_id=agency_id,
                                 audit_type=Audit.DATA_CATALOG_VALIDATION)
try:
    with closing(open_streaming_response('GET', agency.data_json_url)) as resp:
        # Use the schema dataset_prefix to get an iterator for the items to be validated.
        objects = ijson.items(resp.raw, schema_info.get('dataset_prefix', ''))
        default_args = {'json_schema_name': schema, 'source_url': agency.data_json_url}
        if audit:
            default_args.update({'audit_id': audit.id})
        # We're going to spin off async tasks, one per object to validate.
        tasks = []
        for num, obj in enumerate(objects):
            args = default_args.copy()
            args.update({'json_object': obj, 'object_position': num})
            task = validate_json_object.apply_async(args=(args,),
                                                    countdown=(num % COUNTDOWN_MODULO))
            tasks.append(task)
except Exception as e:
    logger.exception(e)
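
The open_streaming_response helper above is project-specific, but the same pattern works with plain requests: hand the raw, unbuffered response body to ijson so the catalog is parsed incrementally rather than downloaded whole. A minimal sketch, assuming the endpoint serves a data.json-style catalog with a top-level "dataset" array (the URL and prefix here are illustrative):

import ijson
import requests
from contextlib import closing

# Hypothetical catalog URL; any data.json-style endpoint would do.
url = 'https://example.gov/data.json'

with closing(requests.get(url, stream=True)) as resp:
    resp.raise_for_status()
    # 'dataset.item' yields each dataset object without buffering the full catalog.
    for num, dataset in enumerate(ijson.items(resp.raw, 'dataset.item')):
        print(num, dataset.get('title'))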

import os
import ijson
import numpy as np

def calculate_centroids(self):
    # Remove any stale centroids file before regenerating it.
    if os.path.exists(self.centroids_file):
        os.remove(self.centroids_file)
    idmap = {}
    cent_array = []
    i = 0
    with open(self.corpus_file, 'r') as f:
        # Stream articles from the corpus rather than loading it whole.
        objects = ijson.items(f, 'articles.item')
        for article in objects:
            abstract_text = article["abstractText"]
            abstract_id = article["pmid"]
            text = article["title"] + " " + abstract_text
            centroid = get_centroid_idf(text, self.emb, self.idf, self.stopwords, self.dim)
            cent_array.append(np.array(centroid, dtype=np.float32))
            idmap[i] = abstract_id
            i += 1
    final_cent_array = np.array(cent_array, dtype=np.float32).reshape((i, self.dim))
    print(final_cent_array.shape)
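
get_centroid_idf is defined elsewhere in that project; a plausible reading, given the arguments it receives, is an IDF-weighted average of word embeddings. Below is a hypothetical sketch of such a helper. The signature mirrors the call above, but the body is an assumption, not the project's actual implementation:

import numpy as np

def get_centroid_idf(text, emb, idf, stopwords, dim):
    # Hypothetical sketch: average the IDF-weighted embeddings of in-vocabulary,
    # non-stopword tokens; `emb` and `idf` are assumed to be dicts keyed by token.
    total = np.zeros(dim, dtype=np.float32)
    weight = 0.0
    for token in text.lower().split():
        if token in stopwords or token not in emb:
            continue
        w = idf.get(token, 1.0)
        total += w * np.asarray(emb[token], dtype=np.float32)
        weight += w
    return total / weight if weight > 0 else total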

import json
import ijson

def initialize(self):
    # Load the retrieval results (small enough for a plain json.load) and
    # collect the set of abstracts they reference.
    with open(self.ret_file, 'r') as f:
        data_q = json.load(f)
    abstracts_needed = set()
    for question in data_q["questions"]:
        abstracts_needed |= set(question["retrieved"])

    print("Collecting Abstracts..")
    # Stream the (much larger) corpus with ijson, keeping only the abstracts we
    # need and stopping early once all of them have been found.
    with open(self.corpus_file, 'r') as f:
        corpus = ijson.items(f, 'articles.item')
        for article in corpus:
            pmid = article["pmid"]
            if pmid in abstracts_needed:
                self.corpus_index[pmid] = article["title"] + ' ' + article["abstractText"]
                abstracts_needed.remove(pmid)
            if not abstracts_needed:
                break
    print(len(self.corpus_index))

    q_array_q = []
    q_array_d = []
    q_array_max = []
    print("Reranking..")
    n_questions = len(data_q["questions"])
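
The 'articles.item' prefix used in the two methods above selects each element of the "articles" array nested inside the top-level object. A minimal, self-contained illustration (the sample data is made up):

import io
import ijson

# Hypothetical corpus fragment shaped like the files above: a top-level object
# whose "articles" key holds the array of interest.
corpus = io.BytesIO(b'{"articles": [{"pmid": "1", "title": "A", "abstractText": "..."},'
                    b' {"pmid": "2", "title": "B", "abstractText": "..."}]}')

for article in ijson.items(corpus, 'articles.item'):
    print(article["pmid"], article["title"])

This is also why initialize breaks out of its loop as soon as every needed abstract is found: stopping the iterator means the rest of the file is never parsed at all.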