Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __new__(self, base, entity=None, groupby=None, parent_entity=None,
            primitive=None, use_previous=None, where=None):
    """Factory dispatch: construct the concrete feature class implied by
    the supplied arguments.

    - no primitive, no entity        -> IdentityFeature(base)
    - no primitive, with entity      -> DirectFeature(base, entity)
    - primitive + parent_entity      -> AggregationFeature
    - primitive (+ optional groupby) -> GroupByTransformFeature / TransformFeature
    """
    if primitive is None:
        # No primitive supplied: either an identity feature on the base
        # variable, or a direct feature onto the given entity.
        if entity is None:
            return IdentityFeature(base)
        return DirectFeature(base, entity)

    if parent_entity is not None:
        # Aggregating up to a parent entity requires an aggregation
        # primitive (either an instance or the primitive class itself).
        assert isinstance(primitive, AggregationPrimitive) or issubclass(primitive, AggregationPrimitive)
        return AggregationFeature(base, parent_entity=parent_entity,
                                  use_previous=use_previous, where=where,
                                  primitive=primitive)

    # Remaining case: a transform primitive, optionally grouped.
    assert (isinstance(primitive, TransformPrimitive) or
            issubclass(primitive, TransformPrimitive))
    if groupby is not None:
        return GroupByTransformFeature(base,
                                       primitive=primitive,
                                       groupby=groupby)
    return TransformFeature(base, primitive=primitive)
import nltk
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from .utilities import clean_tokens
class PolarityScore(TransformPrimitive):
    """Calculates the polarity of a text on a scale from -1 (negative) to 1 (positive)

    Description:
        Given a list of strings, assign a polarity score from -1 (negative text),
        to 0 (neutral text), to 1 (positive text). The function returns a score
        for every given piece of text. If a string is missing, return `NaN`.

    Examples:
        >>> x = ['He loves dogs', 'She hates cats', 'There is a dog', '']
        >>> polarity_score = PolarityScore()
        >>> polarity_score(x).tolist()
        [0.677, -0.649, 0.0, 0.0]
    """
    name = "polarity_score"
    # Single text column in, one numeric score per string out.
    input_types = [Text]
    return_type = Numeric
    # NOTE(review): the class appears truncated in this view — the
    # get_function implementation presumably follows in the full file.
import nltk
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text
from .utilities import clean_tokens
class PartOfSpeechCount(TransformPrimitive):
    """Calculates the occurrences of each different part of speech.

    Description:
        Given a list of strings, categorize each word in the string as
        a different part of speech, and return the total count for each
        of 15 different categories of speech.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['He was eating cheese', '']
        >>> part_of_speech_count = PartOfSpeechCount()
        >>> part_of_speech_count(x).tolist()
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0]]
    """
    name = "part_of_speech_count"
    # NOTE(review): class body appears truncated here — input_types,
    # return_type and get_function presumably follow in the full file.
# -*- coding: utf-8 -*-
import numpy as np
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text
class UpperCaseCount(TransformPrimitive):
    """Calculates the number of upper case letters in text.

    Description:
        Given a list of strings, determine the number of characters in each string
        that are capitalized. Counts every letter individually, not just every
        word that contains capitalized letters.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['This IS a string.', 'This is a string', 'aaa']
        >>> upper_case_count = UpperCaseCount()
        >>> upper_case_count(x).tolist()
        [3.0, 1.0, 0.0]
    """
    name = "upper_case_count"
    # NOTE(review): class body appears truncated here — input_types,
    # return_type and get_function presumably follow in the full file.
>>> cum_mean([1, 2, 3, 4, None, 5]).tolist()
[1.0, 1.5, 2.0, 2.5, nan, 2.5]
"""
name = "cum_mean"
input_types = [Numeric]
return_type = Numeric
uses_full_entity = True
def get_function(self):
    """Return the callable that computes a running mean over a column."""

    def cum_mean(values):
        # Running total divided by the 1-based position of each element.
        # NOTE: with a pandas Series input, cumsum skips NaN entries while
        # the position counter does not, matching the documented example.
        positions = np.arange(1, len(values) + 1)
        return values.cumsum() / positions

    return cum_mean
class CumMin(TransformPrimitive):
    """Calculates the cumulative minimum.

    Description:
        Given a list of values, return the cumulative min
        (or running min). There is no set window, so the min
        at each point is calculated over all prior values.
        `NaN` values will return `NaN`, but in the window of a
        cumulative calculation, they're ignored.

    Examples:
        >>> cum_min = CumMin()
        >>> cum_min([1, 2, -3, 4, None, 5]).tolist()
        [1.0, 1.0, -3.0, -3.0, nan, -3.0]
    """
    name = "cum_min"
    input_types = [Numeric]
    # NOTE(review): class appears truncated here — return_type and
    # get_function presumably follow in the full file.
import nltk
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from .utilities import clean_tokens
class LSA(TransformPrimitive):
"""Calculates the Latent Semantic Analysis Values of Text Input
Description:
Given a list of strings, transforms those strings using tf-idf and single
value decomposition to go from a sparse matrix to a compact matrix with two
values for each string. These values represent that Latent Semantic Analysis
of each string. These values will represent their context with respect to
(nltk's brown sentence corpus.)[https://www.nltk.org/book/ch02.html#brown-corpus]
If a string is missing, return `NaN`.
Examples:
>>> lsa = LSA()
>>> x = ["he helped her walk,", "me me me eat food", "the sentence doth long"]
>>> res = lsa(x).tolist()
>>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text
from .utilities import clean_tokens
class DiversityScore(TransformPrimitive):
"""Calculates the overall complexity of the text based on the total
number of words used in the text
Description:
Given a list of strings, calculates the total number of unique words
divided by the total number of words in order to give the text a score
from 0-1 that indicates how unique the words used in it are. This
primitive only evaluates the 'clean' versions of strings, so ignoring cases,
punctuation, and stopwords in its evaluation.
If a string is missing, return `NaN`
Examples:
>>> diversity_score = DiversityScore()
>>> diversity_score(["hi hi hi", "hello its me", "hey what hey what", "a dog ate a basket"]).tolist()
[0.3333333333333333, 1.0, 0.5, 1.0]
>>> cum_sum([1, 2, 3, 4, None, 5]).tolist()
[1.0, 3.0, 6.0, 10.0, nan, 15.0]
"""
name = "cum_sum"
input_types = [Numeric]
return_type = Numeric
uses_full_entity = True
def get_function(self):
    """Return the callable that computes a running sum over a column."""

    def cum_sum(values):
        # Delegate to the input's own cumsum (pandas Series / numpy array);
        # with a Series, NaN positions stay NaN but accumulation continues.
        running_total = values.cumsum()
        return running_total

    return cum_sum
class CumCount(TransformPrimitive):
    """Calculates the cumulative count.

    Description:
        Given a list of values, return the cumulative count
        (or running count). There is no set window, so the
        count at each point is calculated over all prior
        values. `NaN` values are counted.

    Examples:
        >>> cum_count = CumCount()
        >>> cum_count([1, 2, 3, 4, None, 5]).tolist()
        [1, 2, 3, 4, 5, 6]
    """
    name = "cum_count"
    # Two alternative single-input signatures: an Id column or a Discrete
    # column. NOTE(review): Id/Discrete are not imported in any visible
    # import block — confirm against the full file.
    input_types = [[Id], [Discrete]]
    return_type = Numeric
# for python 2.7
module = imp.load_source(module, filepath)
else:
# TODO: what is the first argument"?
# for python >3.5
spec = importlib.util.spec_from_file_location(module, filepath)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
primitives = []
for primitive_name in vars(module):
primitive_class = getattr(module, primitive_name)
if (isclass(primitive_class) and
issubclass(primitive_class, PrimitiveBase) and
primitive_class not in (AggregationPrimitive,
TransformPrimitive)):
primitives.append((primitive_name, primitive_class))
if len(primitives) == 0:
raise RuntimeError("No primitive defined in file %s" % filepath)
elif len(primitives) > 1:
raise RuntimeError("More than one primitive defined in file %s" % filepath)
return primitives[0]
# -*- coding: utf-8 -*-
import numpy as np
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text
class TitleWordCount(TransformPrimitive):
    """Determines the number of title words in a string.

    Description:
        Given list of strings, determine the number of title words
        in each string. A title word is defined as any word starting
        with a capital letter. Words at the start of a sentence will
        be counted.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['My favorite movie is Jaws.', 'this is a string', 'AAA']
        >>> title_word_count = TitleWordCount()
        >>> title_word_count(x).tolist()
        [2.0, 0.0, 1.0]
    """
    # NOTE(review): class body is cut off at this point in the chunk —
    # name, input_types, return_type and get_function presumably follow.