# NOTE(review): scraper banner, not part of the program — kept as a comment so the file parses.
# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this fragment starts mid-function — the `die` below is presumably
# guarded by a preceding "CANDC env var missing" check outside this view; confirm.
die('did not find C&C parser at CANDC environmental variable.')
# Resolve the C&C installation layout: the `generate` binary plus the
# markedup/cats data files it needs.
CANDC_DIR = Path(candc_dir).resolve()
GENERATE = CANDC_DIR / 'bin' / 'generate'
MARKEDUP = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats' / 'markedup'
CATS = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats'
if not GENERATE.exists():
    logger.error('Currently the evalution script requires C&C parser compiled from its source.')
    die('expected: $CANDC/bin/generate')
elif not MARKEDUP.exists() or not CATS.exists():
    # FIX: original wrote `CATS.exists` (no call) — a bound method is always
    # truthy, so this branch could never trigger.
    logger.error('The C&C directory is not configured expectedly.')
    die('expected: $CANDC/src/data/ccg/cats/markedup')
# FIX: tempfile.mktemp() is deprecated and race-prone; NamedTemporaryFile
# creates the file atomically. delete=False keeps it alive for the subprocess.
with tempfile.NamedTemporaryFile('w', delete=False) as f:
    tmp = f.name
    print(tmp)
    # Flatten every tree in the AUTO file, one per line, for `generate`.
    for _, tokens, tree in read_auto(auto_file):
        print(tree.auto_flat(tokens=tokens), file=f)
# FIX: build an argv list and run with shell=False — the original f-string +
# shell=True broke on paths containing spaces and allowed shell injection.
command = [str(GENERATE), '-j', str(CATS), str(MARKEDUP), tmp]
proc = subprocess.Popen(command,
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
results, error = proc.communicate()
# Any stderr output is treated as fatal, matching the original behavior.
if error:
    die(f'caught error in running $CANDC/bin/generate: {error.decode("utf-8")}')
lines = iter(results.decode('utf-8').split('\n'))
# Accumulators for labelled/unlabelled dependencies; consumed below this view.
deps, udeps = set(), set()
rule_ids = {}
line = next(lines)
def create_traindata(args):
    """Build training vocabulary/sample files from an AUTO treebank.

    Reads trees from ``args.PATH``, drops parse failures, accumulates word,
    category, and affix statistics, then writes frequency-thresholded vocab
    files (target.txt, words.txt, suffixes.txt, prefixes.txt) into ``args.OUT``.
    """
    creator = TrainingDataCreator(args.PATH,
                                  args.word_freq_cut,
                                  args.cat_freq_cut,
                                  args.afix_freq_cut)
    # Skip sentences the parser could not analyze.
    parsed = [t for _, _, t in read_auto(creator.filepath) if t.word != 'FAILED']
    logger.info(f'loaded {len(parsed)} trees')
    for t in parsed:
        creator._traverse(t)
    creator._create_samples(parsed)

    def dump_filtered(counts, threshold, filename):
        # Keep only entries meeting the frequency cutoff, then write them out.
        kept = {entry: freq for entry, freq in counts.items() if freq >= threshold}
        creator._write(kept, args.OUT / filename)

    dump_filtered(creator.cats, creator.cat_freq_cut, 'target.txt')
    dump_filtered(creator.words, creator.word_freq_cut, 'words.txt')
    dump_filtered(creator.suffixes, creator.afix_freq_cut, 'suffixes.txt')
    dump_filtered(creator.prefixes, creator.afix_freq_cut, 'prefixes.txt')
def convert_json(autopath):
    """Read an AUTO file and return its JSON-serializable training samples.

    Parse failures (trees whose word is 'FAILED') are discarded; no frequency
    cutoffs are applied.
    """
    creator = TrainingDataCreator(autopath, None, None, None)
    usable = [t for _, _, t in read_auto(creator.filepath) if t.word != 'FAILED']
    logger.info(f'loaded {len(usable)} trees')
    creator._create_samples(usable)
    return creator.samples
def create_testdata(args):
    """Build test-set files from an AUTO treebank.

    Unlike training-data creation, every tree is kept (including failures).
    Writes testdata.json (samples), testsents.txt (raw sentences), and
    testsents.conll (CoNLL format) into ``args.OUT``.
    """
    creator = TrainingDataCreator(args.PATH,
                                  args.word_freq_cut,
                                  args.cat_freq_cut,
                                  args.afix_freq_cut)
    all_trees = [t for _, _, t in read_auto(creator.filepath)]
    creator._create_samples(all_trees)

    with open(args.OUT / 'testdata.json', 'w') as out:
        logger.info(f'writing to {out.name}')
        json.dump(creator.samples, out)

    with open(args.OUT / 'testsents.txt', 'w') as out:
        logger.info(f'writing to {out.name}')
        for sentence in creator.sents:
            print(sentence, file=out)

    with open(args.OUT / 'testsents.conll', 'w') as out:
        logger.info(f'writing to {out.name}')
        creator._to_conll(out)