def _get_data(self) -> list:
    out_path_train = self.root / self.out_filename
    if out_path_train.exists():
        train = load_language_modeling(out_path_train)
        dataset = train
    else:
        dataset = []
        with open(self.root / self.dirname, 'r', encoding='utf-8') as jfile:
            for item in tqdm(ijson.items(jfile, 'item')):
                text = self._normalize(item['text']).strip()
                # Split the document into non-empty lines (one sample per line)
                samples = list(filter(lambda x: len(x) > 0, text.split('\n')))
                dataset += samples
                # If each sample should be a whole document, use the following
                # two lines instead of the two above:
                # sample = '\n'.join(filter(lambda x: len(x) > 0, text.split('\n')))
                # dataset.append(sample)
        # Save the dataset and remove the raw source file
        (self.root / self.dirname).unlink()
        save_language_modeling(dataset, to_path=out_path_train)
    return dataset
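# The method above leans on ijson.items(jfile, 'item'), which streams one element of a
# top-level JSON array at a time instead of loading the whole file into memory. A minimal,
# self-contained sketch of that pattern (the file name and the 'text' field are assumptions
# for illustration, not taken from the code above):
import ijson

def iter_texts(path):
    with open(path, 'r', encoding='utf-8') as jfile:
        for item in ijson.items(jfile, 'item'):  # yields one array element at a time
            yield item.get('text', '')

# for text in iter_texts('corpus.json'):
#     print(text[:80])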
"""
:return dicted js obj g_page_config:
:rtype :dict
"""
if self.detect_anti_spider_from_response(response):
logger.critical(anti_spider_breakpoit_msg)
raise CloseSpider(anti_spider_breakpoit_msg)
#sys.exit() #won't quit, just a exception
# this name is from taobao page: https://s.taobao.com/search?q=空调
g_page_config = ''
for line in response.body.split('\n'):
if 'g_page_config' in line:
g_page_config = line.split('{', 1)[1].rsplit('}', 1)[0]
break
js_obj_gen = ijson.items(StringIO(''.join(('{', g_page_config, '}'))), '')
js_obj = list(js_obj_gen)[0]
if self.detect_anti_spider_from_js_obj(js_obj, response):
logger.critical(anti_spider_breakpoit_msg)
raise CloseSpider(anti_spider_breakpoit_msg)
return js_obj
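# The method above slices the g_page_config assignment out of the page source and parses it
# with an empty ijson prefix (''), which yields the whole top-level value exactly once. A small
# sketch of the same idea on a made-up page line (the HTML content is invented for illustration):
import ijson
from io import StringIO

page_line = 'g_page_config = {"mods": {"itemlist": {"status": "show"}}};'
payload = page_line.split('{', 1)[1].rsplit('}', 1)[0]
parsed = next(ijson.items(StringIO('{' + payload + '}'), ''))
# parsed == {'mods': {'itemlist': {'status': 'show'}}}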
def read_json(location, result):
    selector = result.value(api_vocab.selector)
    if selector is not None:
        selector = selector.value
    else:
        selector = ""
    with get_content(location, result) as fo:
        yield from enumerate(ijson.items(fo, selector))
def generator(self, instream):
    from ijson import items
    docs = items(instream, 'rows.item')
    self.generator_index = -1
    for doc in docs:
        self.generator_index += 1
        if self.docHandler is not None and isinstance(self.docHandler, types.FunctionType):
            yield self.docHandler(doc)
        elif self.docHandler is not None and isinstance(self.docHandler, CouchDBDocProcessor):
            yield self.docHandler.process(doc)
        else:
            yield doc
def parse(self, instream):
    from ijson import items
    docs = items(instream, 'rows.item')
    count = 0
    for doc in docs:
        count += 1
        if self.docHandler is not None and isinstance(self.docHandler, types.FunctionType):
            self.docHandler(doc)
        elif self.docHandler is not None and isinstance(self.docHandler, CouchDBDocProcessor):
            self.docHandler.process(doc)
        log.debug("DOC: %s" % json.dumps(doc))
    return count
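# Both methods above walk a CouchDB-style view result with the prefix 'rows.item', i.e. each
# element of the top-level "rows" array. A minimal sketch with an in-memory response body
# (the sample payload is invented for illustration):
import io
import ijson

couch_response = io.StringIO('{"total_rows": 2, "rows": [{"id": "a"}, {"id": "b"}]}')
for doc in ijson.items(couch_response, 'rows.item'):
    print(doc['id'])  # prints "a", then "b"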
... u"Using the hybrid virtual leader and behavioral approach schema, the formation " +
... u"control strategy by means of potential function is proposed. The overall strategy " +
... u"has been successfully applied to the Quadrotor's model of Parrot AR Drone 2.0 in " +
... u"Gazebo simulator programmed using Robot Operating System.\\nAuthor(s) Rizqi, A.A.A. " +
... u"Dept. of Electr. Eng. & Inf. Technol., Univ. Gadjah Mada, Yogyakarta, Indonesia " +
... u"Cahyadi, A.I. ; Adji, T.B.\\nReferenced Items are not available for this document.\\n" +
... u"No versions found for this document.\\nStandards Dictionary Terms are available to " +
... u"subscribers only.",
... u'uri': u'http://dig.isi.edu/autonomy/data/article/6871517',
... u'datePublished': u'2014',
... 'filename': '{}/test_data_large_json.json'.format(test_data_path)}
True
"""
with open(filename, 'r') as f:
    for item in ijson.items(f, json_prefix):
        if hasattr(item, 'keys'):  # item behaves like a dictionary
            item['filename'] = filename
            yield item
        # item is iterable but not a string (e.g. a list of dicts)
        elif __is_iterable(item) and not isinstance(item, str):
            for sub_item in item:
                if hasattr(sub_item, 'keys'):  # sub_item behaves like a dictionary
                    sub_item['filename'] = filename
                    yield sub_item
        else:
            raise ValueError("'item' in json source is not a dict, and is either a string or not iterable: %r" % item)
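# The generator above copes with prefixes whose items are either dicts or lists of dicts.
# A small illustration of how the two shapes come out of ijson.items (both payloads are
# invented for illustration):
import io
import ijson

dict_items = io.StringIO('[{"uri": "a"}, {"uri": "b"}]')
print([d['uri'] for d in ijson.items(dict_items, 'item')])  # ['a', 'b']

list_items = io.StringIO('{"articles": [[{"uri": "c"}], [{"uri": "d"}]]}')
print(list(ijson.items(list_items, 'articles.item')))       # [[{'uri': 'c'}], [{'uri': 'd'}]]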
def do_import(args, db):
    pdb = db()
    friend_name = args.friend_name
    friend = pdb.get_friend_by_name(friend_name)
    if not friend:
        print >> sys.stderr, "No friend by that name, check your spelling or create a new friend using add_friend"
        return False
    friend_id = friend['id']

    print "Importing Authors"
    with open(args.file_name) as import_file:
        authors_to_insert = []
        author_docs = ijson.items(import_file, 'authors.item')
        for author_doc in author_docs:
            authors_to_insert.append(author_doc)
            if len(authors_to_insert) >= INSERT_BATCH_SIZE:
                print "."
                pdb.load_author_documents_from_friend(friend_id, authors_to_insert)
                authors_to_insert = []
        if authors_to_insert:
            pdb.load_author_documents_from_friend(friend_id, authors_to_insert)

    print "Importing Tomes"
    with open(args.file_name) as import_file:
        tomes_to_insert = []
        tome_docs = ijson.items(import_file, 'tomes.item')
        for tome_doc in tome_docs:
            tomes_to_insert.append(tome_doc)
            if len(tomes_to_insert) >= INSERT_BATCH_SIZE:
    arg_parser.add_argument('--test', dest='test', action='store_true', default=False)
    arg_parser.add_argument('--inputFile', type=str, nargs=1)
    arg_parser.add_argument('--outputFile', type=str, nargs=1)
    return arg_parser


if __name__ == "__main__":
    args = make_arg_parser().parse_args()
    test_mode = args.test
    temp_file_name = tempfile.mkstemp(prefix="kg2-")[1]
    input_file_name = args.inputFile[0]
    nodes = []
    if input_file_name.endswith('.gz'):
        graph = gzip.GzipFile(input_file_name, 'r')
        for node in items(graph, "nodes"):
            nodes.append(node)
    else:
        with open(input_file_name, 'r') as graph:
            for node in items(graph, "nodes"):
                nodes.append(node)
    # items(..., "nodes") yields the whole "nodes" list as a single item, so unwrap it
    nodes = nodes[0]
    output_graph = {"nodes": nodes}
    output_file_name = args.outputFile[0]
    kg2_util.save_json(output_graph, output_file_name, test_mode)
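# In the script above, items(graph, "nodes") yields the entire "nodes" list as one object,
# which is why nodes[0] is taken afterwards. When the node list itself is too large to hold
# comfortably, the "nodes.item" prefix streams one node at a time instead; a sketch of that
# variant (the file name is an assumption for illustration):
import ijson

def iter_nodes(path):
    with open(path, 'r') as graph:
        for node in ijson.items(graph, 'nodes.item'):  # one node dict at a time
            yield node

# node_count = sum(1 for _ in iter_nodes('kg2-simplified.json'))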
def create_train_file(train_file, train_file_out, opt):
    train_file_op = open(train_file_out, "w")
    positive_samples_count = 0
    negative_samples_count = 0
    aug_samples_count = 0
    train_data_handle = open(train_file, 'rb')
    json_data = ijson.items(train_data_handle, 'item')
    for index, entry in enumerate(json_data):
        if opt.heuristic_data_augmentation > 0:
            correct_answer_rows = get_aug_data(entry, min(opt.heuristic_data_augmentation + (opt.heuristic_data_augmentation * index - aug_samples_count), 9))
            aug_samples_count += len(correct_answer_rows)
        else:
            correct_answer_rows = []
        # row = str(index+1) + "\t"
        context = get_context(entry)
        row = context  # + "\t"
        if len(entry['options-for-correct-answers']) == 0:
            correct_answer = {}
            correct_answer['utterance'] = "None"
            target_id = "NONE"
        else:
except:  # The snapshot repository does not exist yet, so create it
    es.snapshot.create_repository(repository=repository_name, body={"type": "fs", "settings": {"location": backup_directory}})
es.snapshot.create(repository=repository_name, snapshot=snapshot_name, body={"indices": index_name})
print("Snapshot created: " + snapshot_name)
print("See all snapshots with GET /_cat/snapshots/openartbrowser_index_backup")
print("If fallback to an old snapshot is required, close the index with POST /" + index_name + "/_close")
print("After this apply the snapshot; this will reopen the index: POST /_snapshot/openartbrowser_index_backup/" + snapshot_name + "/_restore")
else:
    es.indices.create(index=index_name)  # Create the index if it does not exist yet
update_count = 0
creation_count = 0
delete_count = 0

# Document creation
for item in ijson.items(open(file, 'r', encoding='utf-8'), 'item'):
    # Search the index by qid and type (the qId alone is not unique!)
    result = es.search(index=index_name, body={"query": {"bool": {"must": [{"match": {"id": item['id']}}, {"match": {"type": item['type']}}]}}})
    result_length = len(result['hits']['hits'])
    if result_length == 1:  # The document already exists, update it
        elastic_search_id = result['hits']['hits'][0]['_id']
        es.update(id=elastic_search_id, index=index_name, doc_type='data', body={'doc': item})
        update_count += 1
    elif result_length >= 2:  # Remove the current hit if it is a duplicate (sanity check, should not occur)
        elastic_search_id = result['hits']['hits'][0]['_id']  # take the id of this duplicate hit rather than a stale value
        es.delete(id=elastic_search_id, index=index_name, doc_type='data')
        delete_count += 1
        # raise RuntimeError("There is a duplicate document in the index for the following qId: " + item['id'])  # ToDo: comment in if there are problems with duplicates
    else:
        es.create(id=uuid.uuid4(), index=index_name, doc_type='data', body=item)
        creation_count += 1
end = time.time()
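# The loop above issues one search plus one update/create/delete per streamed item. If plain
# (re)indexing is enough, the same ijson stream can be fed to the elasticsearch-py bulk helper,
# which sends documents in chunks. A hedged sketch, not the project's actual code (index name,
# file handling and the absence of the duplicate check are assumptions for illustration):
import ijson
from elasticsearch import Elasticsearch, helpers

def bulk_index(file_path, index_name):
    es = Elasticsearch()
    actions = (
        {"_index": index_name, "_source": item}
        for item in ijson.items(open(file_path, 'r', encoding='utf-8'), 'item')
    )
    helpers.bulk(es, actions)  # batches the generator into chunked bulk requests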