Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this excerpt lost its indentation and is cut off after the last
# line shown; the nesting described in the comments below is inferred from the
# statement order -- confirm against the original file.
# Purpose: stream a large JSON comments dataset and group comment texts by author.
def main():
# Maps a comment author to the list of that author's comment texts.
users_comments_dict = collections.defaultdict(list)
# The progress-bar total is hard-coded (presumably the number of comments in
# the dataset -- TODO confirm).
with tqdm(desc="Grouping comments by user", total=12704751) as progress_bar:
# State for the comment map currently being read from the event stream.
inside_comment = False
comment_text = None
comment_username = None
with open(COMMENTS_DATASET_FILE_PATH, 'rb') as file_:
# The JSON file is large (2.5GB) and sits on a single line, so it is better to read it as a stream,
# using a SAX-like approach.
for prefix, type_, value in ijson.parse(file_):
if inside_comment:
# Capture the fields of interest by the suffix of the event prefix.
if prefix.endswith('.text'):
comment_text = value
elif prefix.endswith('.author'):
comment_username = value
elif type_ == 'end_map': # This assumes there are no nested maps inside the comment maps.
# Keep the comment only when both fields are non-empty, the text is not the
# literal string 'nan' and the author is not the '[deleted]' placeholder.
if comment_text and comment_username and comment_text != 'nan' \
and comment_username != '[deleted]':
users_comments_dict[comment_username].append(comment_text)
# Reset the per-comment state and advance the progress bar by one
# (presumably once per closed comment map, kept or not -- TODO confirm nesting).
inside_comment = False
comment_text = None
comment_username = None
progress_bar.update()
# A map opening under a non-empty prefix; the body of this branch is not
# visible in this excerpt.
elif type_ == 'start_map' and prefix:
# NOTE(review): this excerpt has no enclosing header, lost its indentation and
# is cut off after the last line shown; nesting is inferred from statement order.
# Purpose: walk the events of an ncdu export file with ijson while tracking a
# small parser state machine.
# Initial parser state and the accumulators used while walking the export.
state = ParserState.START
dirs = []
key = None
obj = {}
# Command line: the export file (opened in binary mode by argparse), the output
# format for directory names, and a verbose switch.
argp = argparse.ArgumentParser()
argp.add_argument("file", type=argparse.FileType("rb"), help="ncdu export filename")
argp.add_argument("--dirs", choices=["array", "string"], default="string", help="directory name format output to flat file")
argp.add_argument("--verbose", action="store_true", help="enable verbose mode (inc. ijson variant)")
options = argp.parse_args()
if options.verbose:
# Report which ijson module is in use on stderr.
sys.stderr.write("ijson module variant: {}\n".format(ijson.__name__))
parser = ijson.parse(options.file)
for prefix, event, value in parser:
if event == "start_array":
if state != ParserState.START:
# started non-header array (directory listing)
state = ParserState.ARRAY_START
else:
# started header, omit this map
state = ParserState.HEADER
elif event == "end_array":
# array means a (sub)directory so it was at least a second entry
# (first entry is the directory's meta-data)
state = ParserState.SUBSEQ_MAP
# Drop the last entry of dirs, if any, now that this array has closed.
if dirs:
dirs.pop()
elif state == ParserState.ARRAY_START and event == "start_map":
# directory's meta-data
# (the body of this branch is not visible in this excerpt)
# NOTE(review): interior of a method whose header is not visible in this
# excerpt; indentation was lost, so nesting is inferred from statement order.
# Purpose: run the completion command for a request and return the parsed
# completions, rejecting a request that is already in progress.
# prevent duplicate requests
if request in self.current_requests:
raise AutocompleteRequestError(
"Request denied: completion for \"{request}\" "
"already in progress.".format(request=request)
)
# start request
self.current_requests.add(request)
# NOTE(review): nothing between the add() above and the discard() below is
# wrapped in try/finally, so an exception while building, running or parsing
# the command leaves the request registered and every retry is then denied.
# get completion command
cmd = self.get_completion_cmd(view, text, offset)
# run completion command
# NOTE(review): the command runs through the shell (shell=True); confirm that
# get_completion_cmd quotes any user-controlled text. stderr is merged into
# stdout, so anything the command writes to stderr reaches the JSON parser too.
p = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
parser = ijson.parse(p.stdout)
# Materialise the completions before the request is marked as finished.
completions = list(self._parse_completions(parser, included=included))
# finish request
self.current_requests.discard(request)
# NOTE(review): p is not waited on or closed in the visible lines.
return completions
def yield_obj(path, basepath):
    """Stream values built from a gzip-compressed JSON file, one per map at ``basepath``.

    Every parser event is fed to an ``ijson.common.ObjectBuilder``. Each time
    a map closes at exactly ``basepath`` the builder's current value is
    yielded and a fresh builder is started for the next one.

    Args:
        path: Path of the gzip-compressed JSON file to read.
        basepath: ijson prefix of the maps to emit.

    Yields:
        The builder's value at every ``end_map`` event whose prefix equals
        ``basepath``.
    """
    # NOTE(review): events that arrive before the first matching map are also
    # fed to the builder, so the first yielded value may include enclosing
    # containers -- confirm against ijson's ObjectBuilder behaviour.
    with gzip.open(path, "r") as fin:
        builder = ijson.common.ObjectBuilder()
        for prefix, event, val in ijson.parse(fin):
            try:
                builder.event(event, val)
            except Exception:
                # Best effort: a builder that cannot accept this event is
                # reported and parsing carries on. Only ``Exception`` is
                # caught, so KeyboardInterrupt and SystemExit still propagate.
                if hasattr(builder, "value"):
                    print(builder.value)
            if prefix == basepath and event == "end_map":
                if hasattr(builder, "value"):
                    yield builder.value
                # Start from a clean builder for the next object.
                builder = ijson.common.ObjectBuilder()
def initModel(self, model_path):
    """Record every distinct ``(prefix, event)`` pair found in a JSON model file.

    The file is streamed with ``ijson.parse``. Each pair not already present
    in ``self._model_prefixes`` is appended to it, in first-seen order.

    Args:
        model_path: Path of the JSON model file; it is opened in binary mode.
    """
    with open(model_path, 'rb') as model:
        # Set mirror of the list, so membership is O(1) per event instead of
        # a scan over every pair recorded so far.
        seen = set(self._model_prefixes)
        for prefix, event, _value in ijson.parse(model):
            pair = (prefix, event)
            if pair not in seen:
                seen.add(pair)
                self._model_prefixes.append(pair)