How to use the danlp.datasets.DDT class in danlp

To help you get started, we've selected a few danlp examples based on popular ways the library is used in public projects.
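Before the project snippets, here is a minimal sketch of loading the Danish Dependency Treebank, assuming the danlp package is installed and that load_as_conllu with predefined_splits=True returns one pyconll object per split, as described in the danlp documentation:

from danlp.datasets import DDT

ddt = DDT()  # downloads and caches the Danish Dependency Treebank on first use

# Load the predefined train/dev/test splits as pyconll Conll objects
train, dev, test = ddt.load_as_conllu(predefined_splits=True)

print(len(train), len(dev), len(test))  # number of sentences per split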


From alexandrainst/danlp, tests/test_datasets.py:
from danlp.datasets import DDT

def setUp(self):
    # Expected number of sentences in each predefined DDT split
    self.train_len = 4383
    self.dev_len = 564
    self.test_len = 565

    self.ddt = DDT()  # Load the Danish Dependency Treebank
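The three split sizes asserted in these tests can be reproduced directly. A minimal sketch, assuming load_as_simple_ner(predefined_splits=True) returns a (tokens, entities) pair per split, as in the benchmark snippet further down:

from danlp.datasets import DDT

train, dev, test = DDT().load_as_simple_ner(predefined_splits=True)

# Each split is a (sentences_tokens, sentences_entities) pair
print(len(train[0]))  # expected 4383 training sentences
print(len(dev[0]))    # expected 564 development sentences
print(len(test[0]))   # expected 565 test sentences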
From alexandrainst/danlp, tests/test_datasets.py:
from danlp.datasets import DDT
from spacy.gold import GoldCorpus  # spaCy 2.x API

def test_ddt_dataset_with_spacy(self):
    ddt = DDT()  # Load the Danish Dependency Treebank
    corpus = ddt.load_with_spacy()

    # Each entry in train_tuples is a (text, paragraphs) pair;
    # sum the number of sentences over all training paragraphs
    num_sents_train = 0
    for paragraph in [pair[1] for pair in corpus.train_tuples]:
        num_sents_train += len(paragraph)

    self.assertIsInstance(corpus, GoldCorpus)
    self.assertEqual(self.train_len, num_sents_train)
# temporarily omitted due to changes in storage
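The same counting pattern should work for the development split. A minimal sketch, assuming spaCy 2's GoldCorpus also exposes a dev_tuples property with the same (text, paragraphs) structure:

from danlp.datasets import DDT

corpus = DDT().load_with_spacy()

num_sents_dev = 0
for paragraph in [pair[1] for pair in corpus.dev_tuples]:
    num_sents_dev += len(paragraph)

print(num_sents_dev)  # expected 564, matching the test constants above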
From alexandrainst/danlp, examples/benchmarks/ner_benchmarks.py:
def is_misc(ent: str):
    # True for entity tags ending in 'MISC', e.g. 'B-MISC' or 'I-MISC'
    if len(ent) < 4:
        return False
    return ent[-4:] == 'MISC'


def remove_miscs(se: list):
    # Replace every MISC tag with the outside tag 'O'
    return [
        [entity if not is_misc(entity) else 'O' for entity in entities]
        for entities in se
    ]
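To make the replacement concrete, a small illustrative example (the tag sequences are made up, not taken from the benchmark data):

tags = [['B-PER', 'I-PER', 'O', 'B-MISC'],
        ['I-MISC', 'O', 'B-LOC']]
print(remove_miscs(tags))
# [['B-PER', 'I-PER', 'O', 'O'], ['O', 'O', 'B-LOC']]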


from danlp.datasets import DDT

# Load the DaNE data (the DDT splits with named entity annotations)
_, _, test = DDT().load_as_simple_ner(predefined_splits=True)
sentences_tokens, sentences_entities = test

# Replace MISC with O for fair comparisons
sentences_entities = remove_miscs(sentences_entities)

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])
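For orientation, the two lists loaded above are parallel: each entry of sentences_tokens is a list of tokens, and the matching entry of sentences_entities holds one BIO tag per token. A quick sanity-check sketch:

# Tokens and tags are parallel, one tag per token
assert all(len(t) == len(e)
           for t, e in zip(sentences_tokens, sentences_entities))
print(num_sentences, num_tokens)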


def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot