def test_tagged_corpus_downsample():
    sentence = Sentence(
        "I love Berlin.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer
    )
    corpus: Corpus = Corpus([sentence] * 10, [], [])
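# Hedged companion check for the test above, assuming flair's
# Corpus.downsample(percentage=..., only_downsample_train=...) API:
# downsampling the 10-sentence train split to 30% should leave 3 sentences.
def test_tagged_corpus_downsample_effect():
    sentence = Sentence(
        "I love Berlin.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer
    )
    corpus: Corpus = Corpus([sentence] * 10, [], [])
    assert len(corpus.train) == 10
    corpus.downsample(percentage=0.3, only_downsample_train=True)
    assert len(corpus.train) == 3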
# Tail of CSVClassificationCorpus.__init__; the head of the dev-split logic is
# reconstructed (when a dev file exists it is read directly, otherwise roughly
# 10% of the train split is held out as dev via random_split).
if dev_file is not None:
    dev: FlairDataset = CSVClassificationDataset(
        dev_file,
        column_name_map,
        use_tokenizer=use_tokenizer,
        max_tokens_per_doc=max_tokens_per_doc,
        max_chars_per_doc=max_chars_per_doc,
        in_memory=in_memory,
        skip_header=skip_header,
        **fmtparams,
    )
else:
    train_length = len(train)
    dev_size: int = round(train_length / 10)
    splits = random_split(train, [train_length - dev_size, dev_size])
    train = splits[0]
    dev = splits[1]

super(CSVClassificationCorpus, self).__init__(
    train, dev, test, name=data_folder.name
)
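# Standalone illustration of the 90/10 hold-out above: torch's random_split
# partitions a dataset into disjoint subsets of the requested sizes.
import torch
from torch.utils.data import TensorDataset, random_split

toy = TensorDataset(torch.arange(100))
dev_size = round(len(toy) / 10)
train_part, dev_part = random_split(toy, [len(toy) - dev_size, dev_size])
assert len(train_part) == 90 and len(dev_part) == 10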
class ParallelTextCorpus(Corpus):
    def __init__(
        self,
        source_file: Union[str, Path],
        target_file: Union[str, Path],
        name: str = None,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
    ):
        """
        Instantiates a Corpus of parallel text from two line-aligned files.
        :param source_file: file with one source sentence per line
        :param target_file: file with one target sentence per line, aligned to source_file
        :param name: optional name of the corpus
        :param use_tokenizer: whether to tokenize the sentences
        :param max_tokens_per_doc: truncate each sentence to this many tokens (-1 for no limit)
        :param max_chars_per_doc: truncate each sentence to this many characters (-1 for no limit)
        :param in_memory: if True, keep all sentence pairs in memory; otherwise read lazily
        """
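# Hedged usage sketch for ParallelTextCorpus; the file paths are hypothetical
# placeholders for two line-aligned text files (sentence i of the source file
# pairs with sentence i of the target file):
parallel_corpus = ParallelTextCorpus(
    "data/translation/train.de",
    "data/translation/train.en",
    name="de-en",
    in_memory=False,  # read lazily rather than loading every pair up front
)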
# Tail of a CoNLL-U corpus loader, wrapped in a hypothetical helper name; the
# file-scan head is reconstructed ("testa"/"testb" follow the CoNLL-03 naming
# convention for the dev and test splits).
def load_conll_ud_corpus(data_folder: Path) -> Corpus:
    train_file = dev_file = test_file = None
    for file in data_folder.iterdir():
        file_name = file.name
        if "train" in file_name:
            train_file = file
        if "dev" in file_name:
            dev_file = file
        if "testa" in file_name:
            dev_file = file
        if "testb" in file_name:
            test_file = file

    log.info("Reading data from {}".format(data_folder))
    log.info("Train: {}".format(train_file))
    log.info("Dev: {}".format(dev_file))
    log.info("Test: {}".format(test_file))

    sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
    sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
    sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)

    return Corpus(
        sentences_train, sentences_dev, sentences_test, name=data_folder.name
    )
def training_data_to_corpus(data_train: TrainingData):
    """Convert TrainingData examples into a flair Corpus for NER training."""
    sentences = []
    for ex in data_train.training_examples:
        sentence = Sentence(ex.text)
        # Tag every token that falls inside an annotated entity span; the
        # `or []` guards against examples that carry no entity annotations.
        for token in sentence.tokens:
            for entity in ex.get("entities") or []:
                if (
                    token.start_pos >= entity["start"]
                    and token.end_pos <= entity["end"]
                ):
                    token.add_tag("ner", entity["entity"])
        sentences.append(sentence)
    # Dev reuses the train sentences; the test split is left empty.
    return Corpus(
        train=CustomDataset(sentences),
        dev=CustomDataset(sentences),
        test=CustomDataset([]),
    )
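# CustomDataset is not defined in this snippet; a minimal sketch of what it
# plausibly is — a FlairDataset backed by a plain list of sentences
# (hypothetical, the real class may differ):
class CustomDataset(FlairDataset):
    def __init__(self, sentences: List[Sentence]):
        self.sentences = sentences

    def is_in_memory(self) -> bool:
        return True

    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, index: int) -> Sentence:
        return self.sentences[index]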
# Tail of a column-corpus loader; the head is reconstructed (it reads the dev
# split from dev_file when one is given, else samples 10% of train as dev).
if dev_file is not None:
    sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
        dev_file, column_format
    )
else:
    sentences_dev: List[Sentence] = [
        sentences_train[i]
        for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
    ]
    sentences_train = [x for x in sentences_train if x not in sentences_dev]

if tag_to_biloes is not None:
    # Convert every sentence's tags to the BIOES (a.k.a. IOBES) scheme.
    for sentence in sentences_train + sentences_test + sentences_dev:
        sentence.convert_tag_scheme(
            tag_type=tag_to_biloes, target_scheme="iobes"
        )

return Corpus(
    sentences_train, sentences_dev, sentences_test, name=data_folder.name
)
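# Standalone sketch of the BIO -> BIOES re-labeling that convert_tag_scheme
# performs above (not flair's implementation): single-token entities become
# S- tags and entity-final tokens become E- tags.
def bio_to_bioes(tags: List[str]) -> List[str]:
    bioes = []
    for i, tag in enumerate(tags):
        if tag == "O":
            bioes.append(tag)
            continue
        prefix, label = tag.split("-", 1)
        next_tag = tags[i + 1] if i + 1 < len(tags) else "O"
        entity_continues = next_tag == "I-" + label
        if prefix == "B":
            bioes.append(("B-" if entity_continues else "S-") + label)
        else:  # prefix == "I"
            bioes.append(("I-" if entity_continues else "E-") + label)
    return bioes

assert bio_to_bioes(["B-PER", "I-PER", "O", "B-LOC"]) == ["B-PER", "E-PER", "O", "S-LOC"]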
if dev_file is not None:
    sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        dev_file,
        use_tokenizer=use_tokenizer,
        max_tokens_per_doc=max_tokens_per_doc,
    )
else:
    # No dev file: sample 10% of the train sentences as the dev split.
    sentences_dev: List[Sentence] = [
        sentences_train[i]
        for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
    ]
    sentences_train = [x for x in sentences_train if x not in sentences_dev]

return Corpus(sentences_train, sentences_dev, sentences_test)
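# The private __sample helper is not shown; a plausible sketch (hypothetical,
# flair's version also fixes the random seed for reproducibility) is to draw
# round(total * proportion) distinct indices at random:
import random

def sample_indices(total: int, proportion: float) -> List[int]:
    return random.sample(range(total), round(total * proportion))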
import logging
from pathlib import Path
from typing import Dict, List, Union

from torch.utils.data import ConcatDataset, Dataset, random_split

from flair.data import (
    Sentence,
    Corpus,
    Token,
    FlairDataset,
    DataPair,
    Dictionary,
    Image,
    Label,
    space_tokenizer,
    segtok_tokenizer,
)
from flair.file_utils import cached_path, unzip_file

log = logging.getLogger("flair")
class ColumnCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_bioes=None,
        comment_symbol: str = None,
        in_memory: bool = True,
        encoding: str = "utf-8",
        document_separator_token: str = None,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format, e.g. {0: "text", 1: "ner"}
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file; if None, dev data is sampled from train
        :param tag_to_bioes: if set, convert this tag type to the BIOES scheme
        :param comment_symbol: if set, lines starting with this symbol are treated as comments
        :param in_memory: if True, keep the full dataset in memory; otherwise read from disk
        :param encoding: file encoding of the data files
        :param document_separator_token: token that marks document boundaries, e.g. "-DOCSTART-"
        """
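# Hedged usage sketch for ColumnCorpus: load a CoNLL-03-style NER corpus.
# The folder and file names are hypothetical placeholders.
columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
corpus = ColumnCorpus(
    "data/conll_03",
    columns,
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
    tag_to_bioes="ner",  # convert the NER column to the BIOES scheme
)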
def get_all_sentences(self) -> Dataset:
    """Concatenate the train, dev and test splits into a single dataset."""
    return ConcatDataset([self.train, self.dev, self.test])
def make_tag_dictionary(self, tag_type: str) -> Dictionary:
    # Make the tag dictionary: "O" first, then every tag value observed in
    # the data, plus the <START> and <STOP> markers used by sequence taggers.
    tag_dictionary: Dictionary = Dictionary()
    tag_dictionary.add_item("O")
    for sentence in self.get_all_sentences():
        for token in sentence.tokens:
            tag_dictionary.add_item(token.get_tag(tag_type).value)
    tag_dictionary.add_item("<START>")
    tag_dictionary.add_item("<STOP>")
    return tag_dictionary