def test_load_sequence_labeling_data(tasks_base_path):
# get training, test and dev data
corpus = flair.datasets.ColumnCorpus(
tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
)
assert len(corpus.train) == 6
assert len(corpus.dev) == 1
assert len(corpus.test) == 1
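# --- illustrative sketch (not part of the test suite) ---
# ColumnCorpus reads CoNLL-style column files: one token per line, whitespace-
# separated columns, blank lines between sentences. The folder, file names and
# tags below are hypothetical stand-ins for the "fashion" fixture, just to show
# what column_format={0: "text", 2: "ner"} maps onto.
from pathlib import Path
import flair.datasets

demo_folder = Path("demo_columns")  # hypothetical folder
demo_folder.mkdir(exist_ok=True)
sample = "red _ B-Color\ndress _ B-Clothing\n\nblue _ B-Color\njeans _ B-Clothing\n"
for name in ("train.txt", "dev.txt", "test.txt"):
    (demo_folder / name).write_text(sample)

demo_corpus = flair.datasets.ColumnCorpus(
    demo_folder,
    column_format={0: "text", 2: "ner"},  # column 0 = token, column 2 = NER tag
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)
print(len(demo_corpus.train), len(demo_corpus.dev), len(demo_corpus.test))  # 2 2 2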
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
corpus_1 = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
)
corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_tag_dictionary("ner")
embeddings = WordEmbeddings("turian")
model: SequenceTagger = SequenceTagger(
hidden_size=64,
embeddings=embeddings,
tag_dictionary=tag_dictionary,
tag_type="ner",
use_crf=False,
)
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
corpus_1 = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
)
corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_tag_dictionary("ner")
embeddings = WordEmbeddings("turian")
tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
embeddings=embeddings,
tag_dictionary=tag_dictionary,
tag_type="ner",
use_crf=False,
)
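# --- illustrative sketch (not from the original test) ---
# The fragment above stops after constructing the SequenceTagger. A typical
# continuation trains it on the MultiCorpus with flair's ModelTrainer; the
# output folder, epoch count and batch size below are arbitrary assumptions.
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.train(
    "resources/taggers/demo_multicorpus",  # hypothetical output directory
    learning_rate=0.1,
    mini_batch_size=2,
    max_epochs=2,
)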
# if a dev file is given, read it directly; otherwise a dev split is sampled from train below
if dev_file is not None:
    dev = ColumnDataset(
        dev_file,
        column_format,
        tag_to_bioes,
        encoding=encoding,
        comment_symbol=comment_symbol,
        in_memory=in_memory,
        document_separator_token=document_separator_token,
    )
else:
train_length = len(train)
dev_size: int = round(train_length / 10)
splits = random_split(train, [train_length - dev_size, dev_size])
train = splits[0]
dev = splits[1]
super(ColumnCorpus, self).__init__(train, dev, test, name=data_folder.name)
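# --- illustrative sketch of the dev-split logic above (not part of ColumnCorpus) ---
# When no dev file is given, roughly 10% of the training sentences are split off
# as a dev set via torch's random_split. A toy example with 20 items:
import torch
from torch.utils.data import TensorDataset, random_split

toy_train = TensorDataset(torch.arange(20))              # pretend: 20 training sentences
toy_dev_size = round(len(toy_train) / 10)                # -> 2
toy_train_part, toy_dev_part = random_split(toy_train, [len(toy_train) - toy_dev_size, toy_dev_size])
print(len(toy_train_part), len(toy_dev_part))            # 18 2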
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
_download_wikiner("pl", dataset_name)
super(WIKINER_POLISH, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
)
class WIKINER_RUSSIAN(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = False,
):
if type(base_path) == str:
base_path: Path = Path(base_path)
# column format
columns = {0: "text", 1: "pos", 2: "ner"}
# this dataset name
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
_download_wikiner("ru", dataset_name)
super(WIKINER_RUSSIAN, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
)
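# --- illustrative usage sketch (not part of the class definitions) ---
# Instantiating one of these WikiNER corpus classes downloads the data into
# flair's cache folder (flair.cache_root / "datasets" / <dataset name>) on first
# use and loads it as a ColumnCorpus. WIKINER_RUSSIAN is used here purely as an
# example.
from flair.datasets import WIKINER_RUSSIAN

wikiner_ru = WIKINER_RUSSIAN()   # auto-downloads on first call
print(wikiner_ru)                # corpus summary with train/dev/test sizes
print(wikiner_ru.train[0])       # first training sentence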
class WNUT_17(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = True,
):
if type(base_path) == str:
base_path: Path = Path(base_path)
# column format
columns = {0: "text", 1: "ner"}
# this dataset name
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
_download_wikiner("nl", dataset_name)
super(WIKINER_DUTCH, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
)
class WIKINER_FRENCH(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = False,
):
if type(base_path) == str:
base_path: Path = Path(base_path)
# column format
columns = {0: "text", 1: "pos", 2: "ner"}
# this dataset name
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
_download_wikiner("en", dataset_name)
super(WIKINER_ENGLISH, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
)
class WIKINER_GERMAN(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = False,
):
if type(base_path) == str:
base_path: Path = Path(base_path)
# column format
columns = {0: "text", 1: "pos", 2: "ner"}
# this dataset name
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
_download_wikiner("de", dataset_name)
super(WIKINER_GERMAN, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
)
class CONLL_03_SPANISH(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = True,
):
if type(base_path) == str:
base_path: Path = Path(base_path)
# column format
columns = {0: "text", 1: "ner"}
# this dataset name
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name)
cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name)
cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)
super(CONLL_03_SPANISH, self).__init__(
data_folder,
columns,
tag_to_bioes=tag_to_bioes,
encoding="latin-1",
in_memory=in_memory,
)
class CONLL_2000(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "np",
in_memory: bool = True,
):
"""
Initialize the CoNLL-2000 corpus for English chunking.
The first time you call this constructor it will automatically download the dataset.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param tag_to_bioes: 'np' by default (chunk tags) and usually should not be changed; you can set it to 'pos' instead to predict POS tags
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""
if type(base_path) == str:
base_path: Path = Path(base_path)
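# --- illustrative usage sketch based on the docstring above ---
# CONLL_2000 downloads itself on first use, so no arguments are needed; the tag
# dictionary call mirrors the test code earlier in this file.
from flair.datasets import CONLL_2000

chunking_corpus = CONLL_2000()   # auto-downloads on first call
chunk_dictionary = chunking_corpus.make_tag_dictionary("np")
print(chunking_corpus)
print(chunk_dictionary)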
image = Image(imageURL=image_info["url"])
for caption in image_info["descriptions"]:
# append Sentence-Image data point
self.data_points.append(
DataPair(Sentence(preprocessor(caption), use_tokenizer=True), image)
)
self.split.append(int(image_info["split"]))
def __len__(self):
return len(self.data_points)
def __getitem__(self, index: int = 0) -> DataPair:
return self.data_points[index]
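# --- illustrative sketch of the DataPair structure built above ---
# Each data point pairs a tokenized caption Sentence with an Image; the caption
# and URL below are placeholders, not real dataset entries.
from flair.data import DataPair, Image, Sentence

pair = DataPair(
    Sentence("a red dress with floral print", use_tokenizer=True),
    Image(imageURL="https://example.com/dress.jpg"),
)
print(pair)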
class CONLL_03(ColumnCorpus):
def __init__(
self,
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = True,
document_as_sequence: bool = False,
):
"""
Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine.
Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put it into some folder. Then point
the base_path parameter in the constructor to this folder.
:param base_path: Path to the CoNLL-03 corpus on your machine
:param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
POS tags or chunks respectively
:param in_memory: If True, keeps dataset in memory giving speedups in training.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
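# --- illustrative usage sketch based on the docstring above ---
# CONLL_03 cannot be downloaded automatically, so base_path must point at the
# folder holding the manually obtained CoNLL-03 files; the path here is a
# placeholder.
from flair.datasets import CONLL_03

conll_corpus = CONLL_03(base_path="resources/tasks/conll_03")  # hypothetical local folder
ner_dictionary = conll_corpus.make_tag_dictionary("ner")
print(conll_corpus)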