How to use the genie-toolkit.DatasetStringifier function in genie-toolkit

To help you get started, we’ve selected a few genie-toolkit examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github stanford-oval / almond-cloud / training / synthetic-gen-process.js View on Github external
locale: args.locale,
        thingpedia: './thingpedia.tt',
        entities: './entities.json',
        dataset: './dataset.tt',
        flags: args.flags,
        template: 'contextual.genie',
        random_seed: 'almond is awesome',
        maxDepth: args.maxdepth,
        targetPruningSize: args.target_pruning_size,
        debug: false, // no debugging, ever, because debugging also goes to stdout
    };

    inputFile
        .pipe(Genie.parallelize(PARALLEL_GENERATION,
            require.resolve('./workers/generate-contextual-worker.js'), options))
        .pipe(new Genie.DatasetStringifier())
        .pipe(process.stdout);

    await StreamUtils.waitFinish(process.stdout);

    process.disconnect();
}
github stanford-oval / almond-cloud / training / synthetic-gen-process.js View on Github external
templateFile: 'index.genie',

        rng: rng,
        locale: args.locale,
        flags: args.flags || {},
        maxDepth: args.maxdepth,
        targetPruningSize: args.target_pruning_size,
        debug: false, // no debugging, ever, because debugging also goes to stdout
    };

    const generator = new Genie.BasicSentenceGenerator(options);
    generator.on('progress', (value) => {
        process.send({ cmd:'progress', v: value });
    });
    const stringifier = new Genie.DatasetStringifier();

    generator.pipe(stringifier).pipe(process.stdout);
    await StreamUtils.waitFinish(process.stdout);

    process.disconnect();
}
github stanford-oval / almond-cloud / training / tasks / prepare-training-set.js View on Github external
const contextualParaphrase = this._downloadParaphrase(true)
                .pipe(new TypecheckStream(this._schemas));

            const basicSource = StreamUtils.chain([basicParaphrase, basicSynthetic], { objectMode: true });

            // Spool the basic (non-contextual, not augmented) dataset to disk
            // We need to do this because:
            // 1) We don't want to run too many generation/processing steps as a pipeline, because that
            //    would use too much memory
            // 2) We need to do multiple passes over the basic dataset for different reasons, and
            //    we can't cache it in memory
            const { path: basicDataset, fd: basicDatasetFD } =
                await tmp.file({ mode: 0o600, dir: '/var/tmp' });

            await StreamUtils.waitFinish(basicSource
                .pipe(new Genie.DatasetStringifier())
                .pipe(fs.createWriteStream(basicDataset, { fd: basicDatasetFD })));
            // basicDatasetFD is closed here

            let contexts = await
                fs.createReadStream(basicDataset, { encoding: 'utf8' })
                .pipe(byline())
                .pipe(new Genie.DatasetParser({ contextual: false }))
                .pipe(new Genie.ContextExtractor(this._schemas))
                .read();

            const contextualized =
                fs.createReadStream(basicDataset, { encoding: 'utf8' })
                .pipe(byline())
                .pipe(new Genie.DatasetParser({ contextual: false }))
                .pipe(new Genie.Contextualizer(contexts, {
                    locale: this._language,
github stanford-oval / almond-cloud / training / download-dataset.js View on Github external
query = dbClient.query(`select id,flags,preprocessed,target_code from example_utterances
            where language = ? and not find_in_set('obsolete',flags)
            and target_code<>'' and preprocessed<>'' and type in (?)
            order by id asc`,
            [language, types]);
    } else {
        query = dbClient.query(`select id,flags,preprocessed,target_code from example_utterances
            where language = ? and not find_in_set('obsolete',flags)
            and target_code<>'' and preprocessed<>''
            order by id asc`,
            [language]);
    }
    if (argv.test)
        argv.eval_prob *= 2;

    const writer = new Genie.DatasetStringifier();
    writer.pipe(argv.output);

    query.on('result', (row) => {
        row.flags = parseFlags(row.flags);
        row.flags.replaced = false;
        writer.write(row);
    });
    query.on('end', () => {
        writer.end();
        dbDone();
    });

    await StreamUtils.waitFinish(argv.output);
    await db.tearDown();
}
github stanford-oval / almond-cloud / training / tasks / prepare-training-set.js View on Github external
untypedStringProbability: 0,
            maxSpanLength: MAX_SPAN_LENGTH,
            ppdbProbabilitySynthetic: this._options.ppdbProbabilitySynthetic,
            ppdbProbabilityParaphrase: this._options.ppdbProbabilityParaphrase,
            syntheticExpandFactor: 1,
            paraphrasingExpandFactor: 30,
            noQuoteExpandFactor: 10,

            ppdbFile: ppdb,

            locale: this._language,
            rng: this._rng,
            debug: this._options.debug,
        });

        const train = new Genie.DatasetStringifier();
        const eval_ = new Genie.DatasetStringifier();
        const promises = [];
        promises.push(StreamUtils.waitFinish(train.pipe(this._options.train)));
        promises.push(StreamUtils.waitFinish(eval_.pipe(this._options.eval)));

        const splitter = new Genie.DatasetSplitter({
            rng: this._rng,
            locale: this._language,

            train,
            eval: eval_,

            evalProbability: this._options.evalProbability,
            forDevices: this._forDevices,
            splitStrategy: this._options.splitStrategy,
            useEvalFlag: true
github stanford-oval / almond-cloud / scripts / manual_train.js View on Github external
help: `Start from the nth line of the input tsv file.`
    });
    argparser.addArgument(['-l', '--locale'], {
        required: false,
        defaultValue: 'en-US',
        help: `BCP 47 locale tag of the natural language being processed (defaults to en-US).`
    });
    argparser.addArgument('--thingpedia', {
        required: false,
        defaultValue: 'thingpedia.json',
        help: `The path to the thingpedia.json file.`
    });
    const args = argparser.parseArgs();


    const learned = new Genie.DatasetStringifier();
    learned.pipe(fs.createWriteStream(args.learned, { flags: (args.offset > 0 ? 'a' : 'w') }));
    const dropped = fs.createWriteStream(args.dropped, { flags: (args.offset > 0 ? 'a' : 'w') });

    let lines = [];
    args.input.setEncoding('utf8');
    const input = args.input.pipe(csv.parse({ columns: true, relax: true, delimiter: '\t' }));
    input.on('data', (line) => {
        lines.push(line);
    });
    await waitEnd(input);

    if (args.offset > 1)
        lines = lines.slice(args.offset-1);

    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    rl.setPrompt('$ ');