import zipfile
from sys import stdout

from wasabi import Printer


def extract_zip(filename: str, destination_dir: str):
    """Extracts a zipped file.

    Parameters
    ----------
    filename : str
        The zipped filename
    destination_dir : str
        The directory where the unzipped contents will be placed
    """
    msg_printer = Printer()
    try:
        with msg_printer.loading(f"Unzipping file {filename} to {destination_dir}"):
            stdout.flush()
            with zipfile.ZipFile(filename, "r") as z:
                z.extractall(destination_dir)

        msg_printer.good(f"Finished extracting {filename} to {destination_dir}")
    except zipfile.BadZipFile:
        msg_printer.fail(f"Could not extract {filename} to {destination_dir}")
def __init__(
    self,
    datasets_manager: DatasetsManager = None,
    predicted_tags_namespace_prefix="predicted_tags",
):
    super(TokenClassificationAccuracy, self).__init__(
        datasets_manager=datasets_manager
    )
    self.datasets_manager = datasets_manager
    self.label_namespaces = datasets_manager.label_namespaces
    self.predicted_tags_namespace_prefix = predicted_tags_namespace_prefix
    self.msg_printer = wasabi.Printer()
    self.classification_metrics_utils = ClassificationMetricsUtils()

    # a mapping between namespace and tp_counters for every class
    self.tp_counter: Dict[str, Dict[str, Any]] = defaultdict(dict)
    self.fp_counter: Dict[str, Dict[str, Any]] = defaultdict(dict)
    self.fn_counter: Dict[str, Dict[str, Any]] = defaultdict(dict)
    self.tn_counter: Dict[str, Dict[str, Any]] = defaultdict(dict)
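# Illustrative shape of the counters above (namespace, tags, and counts are hypothetical):
# the outer key is a label namespace and the inner dict maps each class/tag to a running count.
example_tp_counter = {"NER": {"B-PER": 10, "I-PER": 4, "O": 120}}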
def __init__(
    self,
    model: nn.Module,
    model_filepath: str,
    datasets_manager: DatasetsManager,
    device: Optional[Union[str, torch.device]] = torch.device("cpu"),
    predicted_tags_namespace_prefix: str = "predicted_tags",
):
    super(SequenceLabellingInference, self).__init__(
        model=model,
        model_filepath=model_filepath,
        datasets_manager=datasets_manager,
        device=device,
    )
    self.predicted_tags_namespace_prefix = predicted_tags_namespace_prefix
    self.labels_namespaces = self.datasets_manager.label_namespaces
    self.msg_printer = wasabi.Printer()
    self.metrics_calculator = TokenClassificationAccuracy(
        datasets_manager=datasets_manager
    )

    # The key is the namespace of the different labels
    # The value is a dictionary of label -> idx
    self.label2idx_mapping: Dict[str, Dict[str, Any]] = {}
    self.idx2label_mapping: Dict[str, Dict[str, Any]] = {}
    for namespace in self.labels_namespaces:
        self.label2idx_mapping[namespace] = self.datasets_manager.get_label_idx_mapping(
            label_namespace=namespace
        )
        self.idx2label_mapping[namespace] = self.datasets_manager.get_idx_label_mapping(
            label_namespace=namespace
        )
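# Illustrative shape of the two mappings built above (namespace and labels are hypothetical):
example_label2idx = {"NER": {"B-PER": 0, "I-PER": 1, "O": 2}}
example_idx2label = {
    ns: {idx: label for label, idx in mapping.items()}
    for ns, mapping in example_label2idx.items()
}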
def __init__(
    self,
    token2idx: Dict,
    embedding_type: Union[str, None] = None,
    embedding_dimension: Union[str, None] = None,
):
    self.token2idx = token2idx
    self.embedding_type = embedding_type
    self.embedding_dimension = embedding_dimension
    self.msg_printer = wasabi.Printer()
    self.vocab_embedding = self.load_embedding()
def __init__(
    self,
    datasets_manager: DatasetsManager,
    predicted_tags_namespace_prefix="predicted_tags",
    words_namespace: str = "tokens",
):
    super(ConLL2003Metrics, self).__init__(datasets_manager=datasets_manager)
    self.datasets_manager = datasets_manager
    self.label_namespaces = datasets_manager.label_namespaces
    self.words_namespace = words_namespace
    self.namespace_to_vocab = self.datasets_manager.namespace_to_vocab
    self.predicted_tags_namespace_prefix = predicted_tags_namespace_prefix
    self.msg_printer = wasabi.Printer()
    self.acc_counter: Dict[str, List[float]] = defaultdict(list)
    self.precision_counter: Dict[str, List[float]] = defaultdict(list)
    self.recall_counter: Dict[str, List[float]] = defaultdict(list)
    self.fmeasure_counter: Dict[str, List[float]] = defaultdict(list)
"""
self.classname2idx = self.get_classname2idx()
self.idx2classname = {
idx: classname for classname, idx in self.classname2idx.items()
}
self.filename = filename
self.train_size = train_size
self.test_size = test_size
self.validation_size = validation_size
self.dataset_type = dataset_type
self.debug = debug
self.debug_dataset_proportion = debug_dataset_proportion
self.max_instance_length = max_instance_length
self.lines, self.labels = self.get_lines_labels(filename=self.filename)
self.msg_printer = wasabi.Printer()
)
parser.add_argument(
    "--sample_proportion", help="Sample proportion of the dataset", type=float
)
parser.add_argument(
    "--num_layers", help="Number of layers in rnn2seq encoder", type=int
)
parser.add_argument(
    "--add_projection_layer",
    help="Add projection layer in rnn2seq encoder",
    action="store_true",
)
args = parser.parse_args()

msg_printer = wasabi.Printer()
data_dir = pathlib.Path(DATA_DIR)
train_filename = data_dir.joinpath("conll_bioul.train")
dev_filename = data_dir.joinpath("conll_bioul.dev")
test_filename = data_dir.joinpath("conll_bioul.test")
instance_preprocessing = InstancePreprocessing()

data_manager = CoNLLDatasetManager(
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=test_filename,
    column_names=["POS", "DEP", "NER"],
    train_only="ner",
    namespace_vocab_options={
        "tokens": {"preprocessing_pipeline": [instance_preprocessing.lowercase]}
    utilities. For more information on the dataset, you can refer to https://scienceie.github.io/

    Parameters
    ----------
    folderpath : pathlib.Path
        The path where the ScienceIE dataset is stored
    ignore_warnings : bool
        If True, then all the warnings generated by this class for inconsistencies in the
        data are ignored
    """
    self.folderpath = folderpath
    self.ignore_warning = ignore_warnings
    self.entity_types = ["Process", "Material", "Task"]
    self.file_ids = self.get_file_ids()
    self.msg_printer = wasabi.Printer()
    self.nlp = spacy.load("en_core_web_sm")
    self._conll_col_sep = " "
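# Usage sketch (illustrative): the enclosing class is not shown in this snippet, so the
# class name `ScienceIEUtils` and the folder path below are placeholders.
# utils = ScienceIEUtils(folderpath=pathlib.Path("scienceie_train"), ignore_warnings=True)
# utils.file_ids would then hold the annotation file ids discovered under that folder.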
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and what they assign and require, as
    well as any problems if available.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc.).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems".
    """
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
    for i, (name, pipe) in enumerate(nlp.pipeline):
        requires = getattr(pipe, "requires", [])
        assigns = getattr(pipe, "assigns", [])
        retok = getattr(pipe, "retokenizes", False)
        overview.append((i, name, requires, assigns, retok))
        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
    msg.divider("Pipeline Overview")
    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
    msg.table(overview, header=header, divider=True, multiline=True)
    n_problems = sum(len(p) for p in problems.values())
    if any(p for p in problems.values()):
        msg.divider("Problems ({})".format(n_problems))
        for name, problem in problems.items():
            if problem:
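# Usage sketch (illustrative): analyse a small pretrained pipeline. This assumes the
# `en_core_web_sm` model is installed; with `no_print=True` the function returns the
# "overview"/"problems" data described in its docstring instead of printing the tables.
import spacy

nlp = spacy.load("en_core_web_sm")
summary = print_summary(nlp, no_print=True)
print(summary["problems"])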
    Sample format:

    -DOCSTART- -X- O O

    I O
    like O
    London B-GPE
    and O
    New B-GPE
    York I-GPE
    City I-GPE
    . O
    """
    msg = Printer(no_print=no_print)
    doc_delimiter = "-DOCSTART- -X- O O"
    # check for existing delimiters, which should be preserved
    if "\n\n" in input_data and seg_sents:
        msg.warn(
            "Sentence boundaries found, automatic sentence segmentation with "
            "`-s` disabled."
        )
        seg_sents = False
    if doc_delimiter in input_data and n_sents:
        msg.warn(
            "Document delimiters found, automatic document segmentation with "
            "`-n` disabled."
        )
        n_sents = 0
    # do document segmentation with existing sentences
    if "\n\n" in input_data and doc_delimiter not in input_data and n_sents: