How to use the `srsly.write_json` function from the `srsly` library

To help you get started, we’ve selected a few srsly examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github explosion / spaCy / tests / test_gold.py View on Github external
def test_roundtrip_docs_to_json():
    """Round-trip a Doc through docs_to_json -> JSON file -> GoldCorpus
    and check that text, sentence flags, and textcat labels survive."""
    text = "I flew to Silicon Valley via London."
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    doc.cats = cats
    # Mark the whole text as one sentence: first token starts it,
    # every other token explicitly does not.
    doc[0].is_sent_start = True
    for i in range(1, len(doc)):
        doc[i].is_sent_start = False

    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(json_file), str(json_file))

        # BUG FIX: read back *inside* the tempdir context. The original
        # called train_docs() and ran the assertions after the `with`
        # block exited, i.e. after the temp directory (and the JSON file)
        # had been deleted, so the corpus would be read from a missing file.
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert "TRAVEL" in goldparse.cats
        assert "BAKING" in goldparse.cats
        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
        assert cats["BAKING"] == goldparse.cats["BAKING"]
github explosion / spaCy / spacy / cli / train.py View on Github external
def _collate_best_model(meta, output_path, components):
    """Assemble a "model-best" directory from the best epoch of each pipe.

    Starts from a copy of model-final, then for every component swaps in
    the component directory from the epoch that scored best, and pulls
    that epoch's metric values into ``meta["accuracy"]``. Returns the
    path to the collated model.
    """
    # Resolve the best-scoring source directory per component up front,
    # before model-best is created.
    best_sources = {name: _find_best(output_path, name) for name in components}
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for name, source_dir in best_sources.items():
        # Replace the final-epoch component with the best-scoring one.
        shutil.rmtree(path2str(best_dest / name))
        shutil.copytree(path2str(source_dir / name), path2str(best_dest / name))
        scores = srsly.read_json(source_dir / "accuracy.json")
        for metric in _get_metrics(name):
            meta["accuracy"][metric] = scores[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
github explosion / spaCy / spacy / cli / convert.py View on Github external
# Export data to a file
        # NOTE(review): fragment of spaCy's `convert` CLI — the enclosing
        # function and the `if` that pairs with the `else:` below (an
        # output-directory check, presumably) sit outside this excerpt.
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        # Pick the srsly writer that matches the requested file type.
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good(
            "Generated output file ({} documents): {}".format(len(data), output_file)
        )
    else:
        # Print to stdout
        # "-" is passed as the location, directing srsly to standard output.
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
github explosion / spaCy / spacy / cli / pretrain.py View on Github external
config[key] = str(config[key])
    # NOTE(review): fragment of spaCy's `pretrain` CLI — the function
    # signature and the loop header binding `key` are outside this excerpt.
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    # Persist the (stringified) settings so the run can be reproduced.
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            # Materialized so the corpus can be shuffled and an empty
            # file can be detected below.
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        # Left as a lazy iterator: stdin cannot be shuffled without
        # reading it in full first.
        texts = srsly.read_jsonl("-")
github explosion / spaCy / spacy / cli / train.py View on Github external
meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    # Record vector-table statistics for the saved meta.json.
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    # Only fill in name/version if the user's meta lacks them.
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    # Write the per-epoch meta next to that epoch's model.
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    # On the first epoch, warn about textcat labels whose
                    # ROC AUC could not be computed (negative sentinel).
                    # NOTE(review): fragment ends mid-call — the msg.warn(...)
                    # arguments continue beyond this excerpt.
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
github alexandrainst / danlp / danlp / datasets / wiki_ann.py View on Github external
no_print=True)

                all_sents = file_as_json[0]['paragraphs'][0]['sentences']

                random.seed(42)
                random.shuffle(all_sents)

                train_size = round(len(all_sents) * 0.7)
                train_sents = all_sents[:train_size]
                dev_sents = all_sents[train_size:]

                train_json = [{'id': 0, 'paragraphs': [{'sentences': train_sents}]}]
                dev_json = [{'id': 0, 'paragraphs': [{'sentences': dev_sents}]}]

                srsly.write_json(train_json_path, train_json)
                srsly.write_json(dev_json_path, dev_json)

        assert os.path.isfile(train_json_path) and os.path.isfile(train_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
github alexandrainst / danlp / danlp / datasets / ddt.py View on Github external
import srsly
        # NOTE(review): fragment of danlp's DDT dataset loader — the method
        # signature (it reads `self.dataset_dir` etc.) is outside this
        # excerpt, and the first line's indentation was lost in extraction.
        from spacy.cli.converters import conllu2json
        from spacy.gold import GoldCorpus
        from spacy.gold import Path

        # Convert each CoNLL-U split to spaCy's JSON training format,
        # skipping splits that were already converted on a previous run.
        for part in ['train', 'dev', 'test']:
            conll_path = os.path.join(self.dataset_dir, '{}.{}{}'.format(self.dataset_name, part, self.file_extension))
            json_path = os.path.join(self.dataset_dir, "ddt.{}.json".format(part))

            if not os.path.isfile(json_path):  # Convert the conllu files to json
                with open(conll_path, 'r') as file:
                    file_as_string = file.read()
                    # Strip annotations the converter cannot handle.
                    file_as_string = file_as_string.replace("name=", "").replace("|SpaceAfter", "")
                    file_as_json = conllu2json(file_as_string)

                    srsly.write_json(json_path, file_as_json)

        train_json_path = os.path.join(self.dataset_dir, "ddt.train.json")
        dev_json_path = os.path.join(self.dataset_dir, "ddt.dev.json")

        # Both converted files must exist before building the corpus.
        assert os.path.isfile(train_json_path)
        assert os.path.isfile(dev_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
github explosion / spaCy / spacy / cli / train.py View on Github external
for name, component in nlp_loaded.pipeline:
                                # NOTE(review): fragment of spaCy's `train`
                                # CLI — the surrounding loops/conditionals
                                # are outside this excerpt.
                                # Propagate the beam width under evaluation
                                # to every pipe that supports it.
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                )
                            )
                            # Time the evaluation to derive words-per-second.
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        # Greedy decoding: record as the headline speed/accuracy.
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        # Beam search: keep results keyed by beam width.
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores