[[0.0, 0.0, 0.01], [0.0, 0.0, 0.0]]
Now, if we change the input corpus to something that better resembles
the given text, the same input text will produce a different, more
discerning output. NaN values are handled, as are strings without words.
>>> lsa = LSA()
>>> x = ["the earth is round", "", np.NaN, ".,/"]
>>> res = lsa(x).tolist()
>>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
>>> res
[[0.01, 0.0, nan, 0.0], [0.0, 0.0, nan, 0.0]]
"""
name = "lsa"
input_types = [Text]
return_type = Numeric
default_value = 0
def __init__(self):
# TODO: allow user to use own corpus
self.number_output_features = 2
self.n = 2
try:
brown = nltk.corpus.brown.sents()
except LookupError:
nltk.download('brown')
brown = nltk.corpus.brown.sents()
finally:
self.trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD())
self.trainer.fit([" ".join(sent) for sent in brown])
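# A minimal standalone sketch of the same tf-idf + truncated-SVD idea used in
# the trainer above, shown outside the primitive so the pipeline behaviour is
# easy to inspect. The corpus and example sentences here are made up for
# illustration; only the scikit-learn calls mirror the code above.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

corpus = [
    "the cat sat on the mat",
    "dogs chase cats around the yard",
    "the earth orbits the sun",
    "planets orbit distant stars",
]
trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2))
trainer.fit(corpus)
# Each new string is reduced to two latent components, matching
# number_output_features = 2 above.
print(trainer.transform(["the sun is a star", "my cat likes dogs"]).shape)  # (2, 2)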
Description:
Given a list of strings, categorize each word in the string as
one of 15 different parts of speech, and return the total count
for each category.
If a string is missing, return `NaN`.
Examples:
>>> x = ['He was eating cheese', '']
>>> part_of_speech_count = PartOfSpeechCount()
>>> part_of_speech_count(x).tolist()
[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0]]
"""
name = "part_of_speech_count"
input_types = [Text]
return_type = Numeric
default_value = 0
def __init__(self):
self.number_output_features = 15
self.n = 15
def get_function(self):
types = ['C', 'D', 'E', 'F', 'I', 'J',
'L', 'M', 'N', 'P', 'R', 'T',
'U', 'V', 'W']
def part_of_speech_count(x):
try:
nltk.pos_tag(" ")
except LookupError:
nltk.download('averaged_perceptron_tagger')
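# Hedged sketch of the counting step this primitive builds towards: tag tokens
# with nltk.pos_tag and bucket the tags by their first letter, which appears to
# be what the `types` list above enumerates (Penn Treebank tag families). The
# helper name `count_tags_by_letter` is illustrative, not part of the library,
# and assumes the 'punkt' and 'averaged_perceptron_tagger' data are downloaded.
from collections import Counter

import nltk

def count_tags_by_letter(text, letters):
    tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text))]
    first_letters = Counter(tag[0] for tag in tags)
    return [float(first_letters.get(letter, 0)) for letter in letters]

letters = ['C', 'D', 'E', 'F', 'I', 'J', 'L', 'M', 'N', 'P', 'R', 'T', 'U', 'V', 'W']
# With the default tagger, "He was eating cheese" yields non-zero counts for the
# N, P and V buckets.
print(count_tags_by_letter("He was eating cheese", letters))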
class NumWords(TransformPrimitive):
"""Determines the number of words in a string by counting the spaces.
Examples:
>>> num_words = NumWords()
>>> num_words(['This is a string',
... 'Two words',
... 'no-spaces',
... 'Also works with sentences. Second sentence!']).tolist()
[4, 2, 1, 6]
"""
name = 'num_words'
input_types = [Text]
return_type = Numeric
def get_function(self):
def word_counter(array):
return pd.Series(array).fillna('').str.count(' ') + 1
return word_counter
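# Note that word_counter counts separators, so repeated spaces inflate the
# result. A whitespace-split variant (a sketch, not the library's
# implementation) is more forgiving of irregular spacing:
import pandas as pd

def word_count_split(array):
    return pd.Series(array).fillna('').str.split().str.len()

print(word_count_split(['This is a string', 'two   spaced    words']).tolist())  # [4, 3]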
class TimeSince(TransformPrimitive):
"""Calculates time from a value to a specified cutoff datetime.
Args:
unit (str): Defines the unit of time to count from.
Defaults to Seconds. Acceptable values:
years, months, days, hours, minutes, seconds, milliseconds, nanoseconds
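# Hedged sketch of the unit conversion such a primitive needs: take the
# difference between the cutoff time and each value, express it in seconds,
# and divide by the size of the requested unit. The divisor table is an
# illustrative approximation (months and years use average lengths), not the
# library's exact implementation.
import pandas as pd

UNIT_SECONDS = {
    "seconds": 1,
    "minutes": 60,
    "hours": 3600,
    "days": 86400,
    "months": 86400 * 30.44,   # average month length, assumption
    "years": 86400 * 365.25,   # average year length, assumption
}

def time_since(values, cutoff, unit="seconds"):
    deltas = cutoff - pd.to_datetime(pd.Series(values))
    return deltas.dt.total_seconds() / UNIT_SECONDS[unit]

print(time_since(["2020-01-01", "2019-12-31"], pd.Timestamp("2020-01-02"), unit="days").tolist())  # [1.0, 2.0]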
Args:
None
Examples:
>>> sentences = ["I like to eat pizza", "The roller coaster was built in 1885.", ""]
>>> output = universal_sentence_encoder(sentences)
>>> len(output)
512
>>> len(output[0])
3
>>> values = output[:3, 0]
>>> [round(x, 4) for x in values]
[0.0178, 0.0616, -0.0089]
"""
name = "universal_sentence_encoder"
input_types = [Text]
return_type = Numeric
def __init__(self):
tf.compat.v1.disable_eager_execution()
self.module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
self.embed = hub.Module(self.module_url)
self.number_output_features = 512
self.n = 512
def get_function(self):
def universal_sentence_encoder(col):
with tf.compat.v1.Session() as session:
session.run([tf.compat.v1.global_variables_initializer(),
tf.compat.v1.tables_initializer()])
embeddings = session.run(self.embed(col.tolist()))
return embeddings.transpose()
return universal_sentence_encoder
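# The session-based code above targets the TF1-style hub.Module API. With a
# TF2-era tensorflow_hub the same embedding can be computed eagerly; this is a
# sketch that assumes the /4 model handle is available, not a drop-in
# replacement for the primitive above.
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(["I like to eat pizza", "The roller coaster was built in 1885."])
print(embeddings.shape)  # (2, 512)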
Description:
Given a list of strings, calculates the total number of unique words
divided by the total number of words in order to give the text a score
from 0-1 that indicates how unique its words are. This primitive only
evaluates the 'clean' versions of strings, ignoring case, punctuation,
and stopwords in its evaluation.
If a string is missing, return `NaN`.
Examples:
>>> diversity_score = DiversityScore()
>>> diversity_score(["hi hi hi", "hello its me", "hey what hey what", "a dog ate a basket"]).tolist()
[0.3333333333333333, 1.0, 0.5, 1.0]
"""
name = "diversity_score"
input_types = [Text]
return_type = Numeric
default_value = 0
def get_function(self):
def diversity_score(x):
li = []
for el in x:
if pd.isnull(el):
li.append(np.nan)
else:
el = clean_tokens(el)
if len(el) < 1:
li.append(0.0)
else:
li.append(float(len(set(el))) / float(len(el)))
return pd.Series(li)
return diversity_score
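# Standalone sketch of the unique-to-total ratio with a deliberately naive
# tokenizer (lowercase + split). The real primitive relies on a clean_tokens
# helper that also strips punctuation and stopwords, so its scores can differ.
def naive_diversity(text):
    tokens = text.lower().split()
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

print(naive_diversity("hi hi hi"))            # 0.333...
print(naive_diversity("a dog ate a basket"))  # 0.8 here, since 'a' repeats and is not removed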
Description:
Given a list of strings, determine the number of stopwords
in each string. Looks for any of the English stopwords
defined in `nltk.corpus.stopwords`. Case insensitive.
If a string is missing, return `NaN`.
Examples:
>>> x = ['This is a test string.', 'This is second string', 'third string']
>>> stopword_count = StopwordCount()
>>> stopword_count(x).tolist()
[3, 2, 0]
"""
name = "stopword_count"
input_types = [Text]
return_type = Numeric
default_value = 0
def get_function(self):
def stopword_count(array):
li = []
try:
swords = set(nltk.corpus.stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
swords = set(nltk.corpus.stopwords.words('english'))
try:
tokenizer = nltk.tokenize.word_tokenize
tokenizer("test")
except LookupError:
nltk.download('punkt')
tokenizer = nltk.tokenize.word_tokenize
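# Quick standalone check of the same idea: tokenize, lower-case, and count
# membership in NLTK's English stopword list (assumes the 'punkt' and
# 'stopwords' data are already downloaded).
import nltk
from nltk.corpus import stopwords

swords = set(stopwords.words('english'))
tokens = nltk.word_tokenize("This is a test string.")
print(sum(token.lower() in swords for token in tokens))  # 3 ('this', 'is', 'a')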
elif df[variable].dtype == "object":
if not len(df[variable]):
inferred_type = vtypes.Categorical
elif col_is_datetime(df[variable]):
inferred_type = vtypes.Datetime
else:
inferred_type = vtypes.Categorical
# heuristics to predict this as something other than categorical
sample = df[variable].sample(min(10000, len(df[variable])))
# catch cases where object dtype cannot be interpreted as a string
try:
avg_length = sample.str.len().mean()
if avg_length > 50:
inferred_type = vtypes.Text
except AttributeError:
pass
elif df[variable].dtype == "bool":
inferred_type = vtypes.Boolean
elif pdtypes.is_categorical_dtype(df[variable].dtype):
inferred_type = vtypes.Categorical
elif pdtypes.is_numeric_dtype(df[variable].dtype):
inferred_type = vtypes.Numeric
elif col_is_datetime(df[variable]):
inferred_type = vtypes.Datetime
elif len(df[variable]):
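# The object-dtype branch above relies on a simple average-length heuristic:
# sample up to 10,000 values and treat the column as Text when the mean string
# length exceeds 50 characters. A standalone illustration with made-up data:
import pandas as pd

col = pd.Series(["short"] * 5 + ["x" * 200] * 5)
sample = col.sample(min(10000, len(col)), random_state=0)
avg_length = sample.str.len().mean()
print(avg_length > 50)  # True -> would be inferred as Text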
Description:
Given a list of strings, determine the number of title words
in each string. A title word is defined as any word starting
with a capital letter. Words at the start of a sentence will
be counted.
If a string is missing, return `NaN`.
Examples:
>>> x = ['My favorite movie is Jaws.', 'this is a string', 'AAA']
>>> title_word_count = TitleWordCount()
>>> title_word_count(x).tolist()
[2.0, 0.0, 1.0]
"""
name = "title_word_count"
input_types = [Text]
return_type = Numeric
default_value = 0
def get_function(self):
pattern = r'([A-Z][^\s]*)'
def title_word_count(x):
x = x.reset_index(drop=True)
counts = x.str.extractall(pattern).groupby(level=0).count()[0]
counts = counts.reindex_like(x).fillna(0)
counts[x.isnull()] = np.nan
return counts.astype(float)
return title_word_count
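# The extractall/groupby pattern above does the heavy lifting: extractall
# returns one row per regex match with a MultiIndex of (original row, match
# number), so grouping on level 0 recovers a per-row match count, and
# reindex_like restores the rows that had no matches. A small demonstration:
import pandas as pd

x = pd.Series(['My favorite movie is Jaws.', 'this is a string', 'AAA'])
matches = x.str.extractall(r'([A-Z][^\s]*)')
counts = matches.groupby(level=0).count()[0]
print(counts.reindex_like(x).fillna(0).tolist())  # [2.0, 0.0, 1.0]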
Description:
Given a list of strings, determine the number of punctuation
characters in each string. Looks for any of the following:
!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
If a string is missing, return `NaN`.
Examples:
>>> x = ['This is a test file.', 'This is second line', 'third line: $1,000']
>>> punctuation_count = PunctuationCount()
>>> punctuation_count(x).tolist()
[1.0, 0.0, 3.0]
"""
name = "punctuation_count"
input_types = [Text]
return_type = Numeric
default_value = 0
def get_function(self):
pattern = "(%s)" % '|'.join([re.escape(x) for x in string.punctuation])
def punctuation_count(x):
x = x.reset_index(drop=True)
counts = x.str.extractall(pattern).groupby(level=0).count()[0]
counts = counts.reindex_like(x).fillna(0)
counts[x.isnull()] = np.nan
return counts.astype(float)
return punctuation_count
class PolarityScore(TransformPrimitive):
"""Calculates the polarity of a text on a scale from -1 (negative) to 1 (positive)
Description:
Given a list of strings, assign a polarity score from -1 (negative text),
through 0 (neutral text), to 1 (positive text). The function returns a score
for every given piece of text.
If a string is missing, return `NaN`.
Examples:
>>> x = ['He loves dogs', 'She hates cats', 'There is a dog', '']
>>> polarity_score = PolarityScore()
>>> polarity_score(x).tolist()
[0.677, -0.649, 0.0, 0.0]
"""
name = "polarity_score"
input_types = [Text]
return_type = Numeric
default_value = 0
def get_function(self):
dtk = TreebankWordDetokenizer()
def polarity_score(x):
try:
vader = SentimentIntensityAnalyzer()
except LookupError:
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()
li = []
def vader_pol(sentence):
return (vader.polarity_scores(sentence)['pos'] -