How to use the depccg.tools.reader.read_auto function in depccg

To help you get started, we’ve selected a few depccg examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github masashi-y / depccg / depccg / tools / evaluate.py View on Github external
die('did not find C&C parser at CANDC environmental variable.')
    CANDC_DIR = Path(candc_dir).resolve()
    GENERATE = CANDC_DIR / 'bin' / 'generate'
    MARKEDUP = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats' / 'markedup'
    CATS = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats'
    if not GENERATE.exists():
        logger.error('Currently the evalution script requires C&C parser compiled from its source.')
        die('expected: $CANDC/bin/generate')
    elif not MARKEDUP.exists() or not CATS.exists:
        logger.error('The C&C directory is not configured expectedly.')
        die('expected: $CANDC/src/data/ccg/cats/markedup')

    tmp = tempfile.mktemp()
    print(tmp)
    with open(tmp, 'w') as f:
        for _, tokens, tree in read_auto(auto_file):
            print(tree.auto_flat(tokens=tokens), file=f)

    command = f'{GENERATE} -j {CATS} {MARKEDUP} {tmp}'
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    results, error = proc.communicate()
    if len(error.decode('utf-8')) > 0:
        die(f'caught error in running $CANDC/bin/generate: {error.decode("utf-8")}')

    lines = iter(results.decode('utf-8').split('\n'))
    deps, udeps = set(), set()
    rule_ids = {}
    line = next(lines)
github masashi-y / depccg / depccg / tools / data.py View on Github external
def create_traindata(args):
    """Build training vocabularies from an AUTO file and write them out.

    Reads ``args.PATH`` through a ``TrainingDataCreator``, keeps every tree
    whose ``word`` is not the literal ``'FAILED'``, traverses the kept trees,
    creates samples from them, and then writes four frequency-filtered count
    tables under ``args.OUT``: ``target.txt``, ``words.txt``,
    ``suffixes.txt`` and ``prefixes.txt``.

    Args:
        args: object exposing ``PATH``, ``OUT``, ``word_freq_cut``,
            ``cat_freq_cut`` and ``afix_freq_cut``.
            NOTE(review): presumably an argparse namespace with ``OUT`` being
            a ``pathlib.Path`` (it is combined with ``/``) -- confirm.
    """
    creator = TrainingDataCreator(
        args.PATH,
        args.word_freq_cut,
        args.cat_freq_cut,
        args.afix_freq_cut,
    )

    # Collect the usable trees; entries flagged as 'FAILED' are dropped.
    trees = []
    for _, _, tree in read_auto(creator.filepath):
        if tree.word != 'FAILED':
            trees.append(tree)
    logger.info(f'loaded {len(trees)} trees')

    for tree in trees:
        creator._traverse(tree)
    creator._create_samples(trees)

    # (count table attribute, cutoff attribute, output file name), written
    # in this order. Prefixes and suffixes share the same affix cutoff.
    outputs = (
        ('cats', 'cat_freq_cut', 'target.txt'),
        ('words', 'word_freq_cut', 'words.txt'),
        ('suffixes', 'afix_freq_cut', 'suffixes.txt'),
        ('prefixes', 'afix_freq_cut', 'prefixes.txt'),
    )
    for counts_attr, cutoff_attr, filename in outputs:
        counts = getattr(creator, counts_attr)
        # Keep only entries whose count reaches the corresponding cutoff.
        kept = {
            key: count
            for key, count in counts.items()
            if count >= getattr(creator, cutoff_attr)
        }
        creator._write(kept, args.OUT / filename)
github masashi-y / depccg / depccg / tools / data.py View on Github external
def convert_json(autopath):
    """Load trees from ``autopath`` and return the samples created from them.

    A ``TrainingDataCreator`` is built with ``autopath`` and ``None`` for its
    three remaining constructor arguments. Trees whose ``word`` equals the
    literal ``'FAILED'`` are left out before samples are created.

    Args:
        autopath: location of the AUTO file, passed straight to
            ``TrainingDataCreator``.

    Returns:
        The creator's ``samples`` attribute after ``_create_samples`` ran.
    """
    creator = TrainingDataCreator(autopath, None, None, None)

    usable_trees = []
    for _, _, tree in read_auto(creator.filepath):
        if tree.word != 'FAILED':
            usable_trees.append(tree)
    logger.info(f'loaded {len(usable_trees)} trees')

    creator._create_samples(usable_trees)
    return creator.samples
github masashi-y / depccg / depccg / tools / data.py View on Github external
def create_testdata(args):
    """Create test data files from an AUTO file.

    Reads every tree from ``args.PATH`` (no filtering is applied here),
    creates samples from them, and writes three files under ``args.OUT``:
    ``testdata.json`` (the samples as JSON), ``testsents.txt`` (one entry of
    ``sents`` per line) and ``testsents.conll`` (produced by ``_to_conll``).

    Args:
        args: object exposing ``PATH``, ``OUT``, ``word_freq_cut``,
            ``cat_freq_cut`` and ``afix_freq_cut``.
            NOTE(review): presumably an argparse namespace with ``OUT`` being
            a ``pathlib.Path`` (it is combined with ``/``) -- confirm.
    """
    creator = TrainingDataCreator(
        args.PATH,
        args.word_freq_cut,
        args.cat_freq_cut,
        args.afix_freq_cut,
    )

    trees = []
    for _, _, tree in read_auto(creator.filepath):
        trees.append(tree)
    creator._create_samples(trees)

    json_path = args.OUT / 'testdata.json'
    with open(json_path, 'w') as json_file:
        logger.info(f'writing to {json_file.name}')
        json.dump(creator.samples, json_file)

    sents_path = args.OUT / 'testsents.txt'
    with open(sents_path, 'w') as sents_file:
        logger.info(f'writing to {sents_file.name}')
        for sentence in creator.sents:
            print(sentence, file=sents_file)

    conll_path = args.OUT / 'testsents.conll'
    with open(conll_path, 'w') as conll_file:
        logger.info(f'writing to {conll_file.name}')
        creator._to_conll(conll_file)