# --- CAT: dataset annotation class ---
import os
import json
import pandas
import spacy
from time import sleep
from functools import partial
from multiprocessing import Process, Manager, Queue, Pool, Array
from medcat.cdb import CDB
from medcat.spacy_cat import SpacyCat
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.utils.spelling import CustomSpellChecker
from medcat.utils.spacy_pipe import SpacyPipe
from medcat.preprocessing.cleaners import spacy_tag_punct
from medcat.utils.helpers import get_all_from_name, tkn_inds_from_doc
from medcat.utils.loggers import basic_logger
log = basic_logger("CAT")
# Check scispacy models
from medcat.utils.helpers import check_scispacy
check_scispacy()
class CAT(object):
""" Annotate a dataset
"""
SEPARATOR = ""
NESTED_ENTITIES = os.getenv("NESTED_ENTITIES", 'false').lower() == 'true'
KEEP_PUNCT = os.getenv("KEEP_PUNCT", ":|.").split("|")
    def __init__(self, cdb, vocab=None, skip_stopwords=True, meta_cats=None):
        self.cdb = cdb
        self.vocab = vocab
        # Use a None sentinel to avoid a shared mutable default argument
        self.meta_cats = meta_cats if meta_cats is not None else []
        # Build the required spacy pipeline
# --- CAT: main MedCAT annotation class ---
from functools import partial
from multiprocessing import Process, Manager, Queue, Pool, Array
from medcat.cdb import CDB
from medcat.spacy_cat import SpacyCat
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.utils.spelling import CustomSpellChecker
from medcat.utils.spacy_pipe import SpacyPipe
from medcat.preprocessing.cleaners import spacy_tag_punct
from medcat.utils.helpers import get_all_from_name, tkn_inds_from_doc
from medcat.utils.loggers import basic_logger
from medcat.utils.data_utils import make_mc_train_test
import time
import sys, traceback
from tqdm.autonotebook import tqdm
log = basic_logger("CAT")
class CAT(object):
    r'''
    The main MedCAT class used to annotate documents. It is built on top of
    spaCy and works as a spaCy pipeline, so an instance can be used like a
    spaCy `nlp` model.

    Args:
        cdb (medcat.cdb.CDB):
            The concept database that will be used for NER+L.
        vocab (medcat.utils.vocab.Vocab, optional):
            Vocabulary used for vector embeddings and spelling. Default: None
        skip_stopwords (bool):
            If True the stopwords will be ignored and not detected in the
            pipeline. Default: True
        meta_cats (list of medcat.meta_cat.MetaCAT, optional):
            Meta-annotation models applied on top of the detected entities.
            Default: []
    '''
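    # Example: a minimal usage sketch, assuming a prebuilt CDB and Vocab on
    # disk and that a CAT instance is callable like a spaCy `nlp` model, as
    # the docstring above states. The load calls and paths are placeholders;
    # in some versions the detected entities live in `doc._.ents` instead.
    #
    #     cdb = CDB()
    #     cdb.load_dict("<path to cdb.dat>")
    #     vocab = Vocab()
    #     vocab.load_dict(path="<path to vocab.dat>")
    #     cat = CAT(cdb=cdb, vocab=vocab)
    #     doc = cat("The patient was diagnosed with diabetes mellitus")
    #     print(doc.ents)  # detected entities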
""" Representation class for CDB data
"""
import pickle
import numpy as np
from scipy.sparse import dok_matrix
#from gensim.matutils import unitvec
from medcat.utils.matutils import unitvec, sigmoid
from medcat.utils.attr_dict import AttrDict
from medcat.utils.loggers import basic_logger
import os
import pandas as pd
log = basic_logger("cdb")
class CDB(object):
""" Holds all the CDB data required for annotation
"""
MAX_COO_DICT_SIZE = int(os.getenv('MAX_COO_DICT_SIZE', 10000000))
MIN_COO_COUNT = int(os.getenv('MIN_COO_COUNT', 100))
def __init__(self):
        self.index2cui = []  # A list containing all CUIs
        self.cui2index = {}  # Map from a CUI to its index in the index2cui list
        self.name2cui = {}  # Maps a normalized concept name to a CUI
        self.name2cnt = {}  # Maps a normalized concept name to its count
        self.name_isunique = {}  # Should this name be skipped during detection
        self.name2original_name = {}  # Maps a normalized name to its original form
        self.name2ntkns = {}  # Number of tokens in this name
        self.name_isupper = {}  # Was this name all upper case in the source CDB
        self.cui2desc = {}  # Map between a CUI and its CDB description
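        # Example: how the parallel maps above stay consistent. An
        # illustrative sketch only; `_register` is a hypothetical helper,
        # not part of the CDB API:
        #
        #     def _register(cdb, cui, name):
        #         if cui not in cdb.cui2index:
        #             cdb.cui2index[cui] = len(cdb.index2cui)
        #             cdb.index2cui.append(cui)
        #         cdb.name2cui[name] = cui
        #         cdb.name2cnt[name] = cdb.name2cnt.get(name, 0) + 1
        #
        #     _register(cdb, "C0011849", "diabetes mellitus")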
# --- Training utilities for the LSTM meta-model ---
from sklearn.model_selection import train_test_split
import numpy as np
from medcat.utils.models import LSTM as MODEL
from sklearn.metrics import classification_report, f1_score, confusion_matrix, precision_score, recall_score
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from medcat.utils.loggers import basic_logger
log = basic_logger("utils")
def get_batch(ind, batch_size, x, y, cpos, device):
# Get the start/end index for this batch
start = ind * batch_size
end = (ind+1) * batch_size
# Get the batch
x_batch = x[start:end]
y_batch = y[start:end]
c_batch = cpos[start:end]
# Return and move the batches to the right device
return x_batch.to(device), y_batch.to(device), c_batch.to(device)
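# Example: driving get_batch over a toy dataset. The tensors are random
# stand-ins for tokenized inputs, labels and concept positions.
#
#     import torch
#     x = torch.randint(0, 1000, (256, 40))  # 256 samples, 40 tokens each
#     y = torch.randint(0, 2, (256,))        # binary labels
#     cpos = torch.randint(0, 40, (256,))    # index of the concept token
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     batch_size = 32
#     num_batches = (len(x) + batch_size - 1) // batch_size
#     for ind in range(num_batches):
#         x_b, y_b, c_b = get_batch(ind, batch_size, x, y, cpos, device)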
# --- MakeVocab: vocabulary construction ---
from medcat.utils.vocab import Vocab
import numpy as np
import pandas
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.preprocessing.cleaners import spacy_tag_punct, clean_name, clean_def
from medcat.utils.spacy_pipe import SpacyPipe
from functools import partial
from medcat.utils.spelling import CustomSpellChecker
from gensim.models import Word2Vec
from medcat.preprocessing.iterators import SimpleIter
from medcat.utils.loggers import basic_logger
log = basic_logger("CAT")
class MakeVocab(object):
def __init__(self, cdb, vocab=None, word_tokenizer=None):
self.cdb = cdb
self.w2v = None
if vocab is not None:
self.vocab = vocab
else:
self.vocab = Vocab()
# Build the required spacy pipeline
self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser', 'vectors', 'textcat'])
        # Get the tokenizer (use the caller-supplied one when given)
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer  # attribute name assumed for this sketch
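        # Example: the Word2Vec import above is what learns the embeddings
        # that end up in the Vocab. A minimal sketch with toy sentences,
        # using gensim >= 4 argument names (gensim 3.x takes `size=` instead
        # of `vector_size=`):
        #
        #     from gensim.models import Word2Vec
        #     sentences = [["diabetes", "mellitus", "type", "two"],
        #                  ["patient", "diagnosed", "with", "diabetes"]]
        #     w2v = Word2Vec(sentences, vector_size=300, min_count=1)
        #     vector = w2v.wv["diabetes"]  # 300-dimensional embedding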
# --- SpacyCat: spaCy pipeline component ---
from spacy.tokens import Span
import numpy as np
import operator
from medcat.utils.loggers import basic_logger
from medcat.utils.matutils import unitvec
from medcat.utils.ml_utils import load_hf_tokenizer, build_vocab_from_hf
from spacy.lang.en.stop_words import STOP_WORDS
import os
log = basic_logger("spacycat")
# If TYPE is 'umls', use rules that are only valid for the full UMLS version
if os.getenv('TYPE', 'other').lower() == 'umls':
    log.info("Using cat_ann for annotations")
    from medcat.cat_ann import CatAnn
else:
    log.info("Using basic_cat_ann for annotations")
    from medcat.basic_cat_ann import CatAnn
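# Example: because this import switch runs at module load time, the TYPE
# environment variable has to be set before the module is imported, e.g.:
#
#     import os
#     os.environ["TYPE"] = "umls"  # select the UMLS-specific CatAnn
#     from medcat.spacy_cat import SpacyCat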
class SpacyCat(object):
    """ A spaCy pipe module that can easily be added to a spaCy pipeline

    cdb:  the CDB object (medcat.cdb.CDB) representing the concepts
    vocab:  the Vocab object (medcat.utils.vocab.Vocab) with the vector
        representations
    """
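    # Example: a sketch of wiring SpacyCat into a pipeline. It assumes the
    # spaCy 2.x `add_pipe` API that accepts a component instance, and a
    # scispacy model name (both are assumptions, suggested by the
    # check_scispacy call in the CAT module above):
    #
    #     import spacy
    #     nlp = spacy.load("en_core_sci_md")
    #     spacy_cat = SpacyCat(cdb=cdb, vocab=vocab)
    #     nlp.add_pipe(spacy_cat, last=True)
    #     doc = nlp("Patient presents with type 2 diabetes")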