How to use the featuretools.variable_types.Text class in featuretools

To help you get started, we’ve selected a few featuretools examples based on popular ways it is used in public projects.

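Text marks a column as free-form natural language, which is what makes the text primitives shown below applicable to it. Here is a minimal sketch of tagging a column as Text when building an EntitySet; the table and column names are hypothetical, and the call shown is the pre-1.0 entity_from_dataframe API:

import featuretools as ft
import pandas as pd

# hypothetical table with a free-text column
df = pd.DataFrame({"id": [0, 1, 2],
                   "review": ["Great product", "Too slow", "Loved it"]})

es = ft.EntitySet(id="reviews")
es = es.entity_from_dataframe(
    entity_id="reviews",
    dataframe=df,
    index="id",
    # without an explicit type, short strings are inferred as Categorical
    variable_types={"review": ft.variable_types.Text},
)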

github FeatureLabs / nlp_primitives / nlp_primitives / lsa.py
[[0.0, 0.0, 0.01], [0.0, 0.0, 0.0]]

        Now, if we change the input corpus to something that better resembles
        the given text, the same input text will produce a different, more
        discerning output. NaN values and strings without words are also handled.

        >>> lsa = LSA()
        >>> x = ["the earth is round", "", np.NaN, ".,/"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.01, 0.0, nan, 0.0], [0.0, 0.0, nan, 0.0]]

    """
    name = "lsa"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def __init__(self):
        # TODO: allow user to use own corpus
        self.number_output_features = 2
        self.n = 2

        try:
            brown = nltk.corpus.brown.sents()
        except LookupError:
            nltk.download('brown')
            brown = nltk.corpus.brown.sents()
        finally:
            self.trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD())
            self.trainer.fit([" ".join(sent) for sent in brown])
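The TODO above notes that a user-supplied corpus is not supported yet; still, the same scikit-learn pipeline can be fit on any corpus. A rough sketch, with hypothetical corpus contents:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# hypothetical domain corpus; any list of strings works
corpus = ["the earth is round", "the moon orbits the earth"]

trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2))
trainer.fit(corpus)

# project new text into the 2-component LSA space
embedding = trainer.transform(["the earth is round"])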
github FeatureLabs / nlp_primitives / nlp_primitives / part_of_speech_count.py
Description:
        Given a list of strings, categorize each word in the string
        by part of speech, and return the total count for each of
        15 categories of speech.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['He was eating cheese', '']
        >>> part_of_speech_count = PartOfSpeechCount()
        >>> part_of_speech_count(x).tolist()
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0]]
    """
    name = "part_of_speech_count"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def __init__(self):
        self.number_output_features = 15
        self.n = 15

    def get_function(self):
        types = ['C', 'D', 'E', 'F', 'I', 'J',
                 'L', 'M', 'N', 'P', 'R', 'T',
                 'U', 'V', 'W']

        def part_of_speech_count(x):
            try:
                nltk.pos_tag(" ")
            except LookupError:
                # fetch the tagger model required by nltk.pos_tag
                nltk.download('averaged_perceptron_tagger')
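For intuition, here is a hedged sketch of the underlying computation: NLTK assigns Penn Treebank tags, which the primitive buckets by first letter into the 15 categories listed in types above:

import nltk
from collections import Counter

sentence = "He was eating cheese"
try:
    tags = nltk.pos_tag(nltk.word_tokenize(sentence))
except LookupError:
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    tags = nltk.pos_tag(nltk.word_tokenize(sentence))

# bucket tags by first letter: 'PRP' -> 'P', 'VBD' and 'VBG' -> 'V', 'NN' -> 'N'
counts = Counter(tag[0] for _, tag in tags)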
github FeatureLabs / featuretools / featuretools / primitives / standard / transform_primitive.py
return lambda array: pd.Series(array).fillna('').str.len()


class NumWords(TransformPrimitive):
    """Determines the number of words in a string by counting the spaces.

    Examples:
        >>> num_words = NumWords()
        >>> num_words(['This is a string',
        ...            'Two words',
        ...            'no-spaces',
        ...            'Also works with sentences. Second sentence!']).tolist()
        [4, 2, 1, 6]
    """
    name = 'num_words'
    input_types = [Text]
    return_type = Numeric

    def get_function(self):
        def word_counter(array):
            return pd.Series(array).fillna('').str.count(' ') + 1
        return word_counter


class TimeSince(TransformPrimitive):
    """Calculates time from a value to a specified cutoff datetime.

    Args:
        unit (str): Defines the unit of time to count from.
            Defaults to Seconds. Acceptable values:
            years, months, days, hours, minutes, seconds, milliseconds, nanoseconds
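Returning to the Text primitives: primitives with input_types = [Text], like NumWords above, are only applied to columns typed as Text. A hedged sketch of invoking NumWords through DFS with the pre-1.0 API (entity and column names hypothetical):

import featuretools as ft
import pandas as pd

df = pd.DataFrame({"id": [0, 1], "review": ["This is a string", "Two words"]})
es = ft.EntitySet(id="reviews")
es = es.entity_from_dataframe(entity_id="reviews", dataframe=df, index="id",
                              variable_types={"review": ft.variable_types.Text})

# DFS picks up NumWords because "review" is typed as Text
fm, feature_defs = ft.dfs(entityset=es, target_entity="reviews",
                          trans_primitives=["num_words"])
# fm gains a NUM_WORDS(review) column: [4, 2]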
github FeatureLabs / nlp_primitives / nlp_primitives / universal_sentence_encoder.py
Args:
        None

    Examples:
        >>> sentences = ["I like to eat pizza", "The roller coaster was built in 1885.", ""]
        >>> output = universal_sentence_encoder(sentences)
        >>> len(output)
        512
        >>> len(output[0])
        3
        >>> values = output[:3, 0]
        >>> [round(x, 4) for x in values]
        [0.0178, 0.0616, -0.0089]
    """
    name = "universal_sentence_encoder"
    input_types = [Text]
    return_type = Numeric

    def __init__(self):
        tf.compat.v1.disable_eager_execution()
        self.module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
        self.embed = hub.Module(self.module_url)
        self.number_output_features = 512
        self.n = 512

    def get_function(self):
        def universal_sentence_encoder(col):
            with tf.compat.v1.Session() as session:
                session.run([tf.compat.v1.global_variables_initializer(),
                             tf.compat.v1.tables_initializer()])
                embeddings = session.run(self.embed(col.tolist()))
            return embeddings.transpose()
        return universal_sentence_encoder
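The same embedding call can be made outside featuretools; a sketch mirroring the snippet above, assuming tensorflow and tensorflow_hub versions old enough to expose hub.Module:

import tensorflow as tf
import tensorflow_hub as hub

tf.compat.v1.disable_eager_execution()
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

with tf.compat.v1.Session() as session:
    session.run([tf.compat.v1.global_variables_initializer(),
                 tf.compat.v1.tables_initializer()])
    # one 512-dimensional vector per input sentence
    embeddings = session.run(embed(["I like to eat pizza"]))

print(embeddings.shape)  # (1, 512) before the primitive's transpose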
github FeatureLabs / nlp_primitives / nlp_primitives / diversity_score.py
Description:
        Given a list of strings, calculate the total number of unique words
        divided by the total number of words, giving the text a score
        from 0-1 that indicates how unique its words are. This
        primitive only evaluates the 'clean' version of each string,
        ignoring case, punctuation, and stopwords.

        If a string is missing, return `NaN`.

    Examples:
        >>> diversity_score = DiversityScore()
        >>> diversity_score(["hi hi hi", "hello its me", "hey what hey what", "a dog ate a basket"]).tolist()
        [0.3333333333333333, 1.0, 0.5, 1.0]
    """
    name = "diversity_score"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):

        def diversity_score(x):
            li = []
            for el in x:
                if pd.isnull(el):
                    li.append(np.nan)
                else:
                    el = clean_tokens(el)
                    if len(el) < 1:
                        li.append(0.0)
                    else:
                        li.append(float(len(set(el))) / float(len(el)))
            # one diversity score per input string
            return pd.Series(li)
        return diversity_score
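The score itself is just unique tokens over total tokens. A minimal standalone sketch, using a naive whitespace split in place of the library's clean_tokens helper:

def diversity(text):
    tokens = text.lower().split()  # naive tokenization
    return len(set(tokens)) / len(tokens) if tokens else 0.0

diversity("hi hi hi")            # 1/3, matching the example above
diversity("a dog ate a basket")  # 0.8 here, but 1.0 above, because the
                                 # real primitive drops the stopword 'a'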
github FeatureLabs / nlp_primitives / nlp_primitives / stopword_count.py
Description:
        Given a list of strings, determine the number of stopwords
        in each string. Looks for any of the English stopwords
        defined in `nltk.corpus.stopwords`. Case insensitive.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['This is a test string.', 'This is second string', 'third string']
        >>> stopword_count = StopwordCount()
        >>> stopword_count(x).tolist()
        [3, 2, 0]
    """
    name = "stopword_count"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):

        def stopword_count(array):
            li = []
            try:
                swords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                nltk.download('stopwords')
                swords = set(nltk.corpus.stopwords.words('english'))
            try:
                tokenizer = nltk.tokenize.word_tokenize
            except LookupError:
                nltk.download('punkt')
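The snippet cuts off before the counting step. A hedged reconstruction of the rest of the computation (my sketch, not necessarily the library's exact code): tokenize each string, then count case-insensitive membership in the stopword set:

import nltk

try:
    swords = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    swords = set(nltk.corpus.stopwords.words('english'))

try:
    tokens = nltk.tokenize.word_tokenize('This is a test string.')
except LookupError:
    nltk.download('punkt')
    tokens = nltk.tokenize.word_tokenize('This is a test string.')

# case-insensitive membership test against the stopword set
count = sum(token.lower() in swords for token in tokens)  # 3: 'This', 'is', 'a'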
github FeatureLabs / featuretools / featuretools / utils / entity_utils.py
elif df[variable].dtype == "object":
            if not len(df[variable]):
                inferred_type = vtypes.Categorical
            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Categorical

                # heuristics to predict a type other than categorical
                sample = df[variable].sample(min(10000, len(df[variable])))

                # catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.Text
                except AttributeError:
                    pass

        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical

        elif pdtypes.is_numeric_dtype(df[variable].dtype):
            inferred_type = vtypes.Numeric

        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime

        elif len(df[variable]):
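The Text branch above is a simple length heuristic: object columns whose sampled average string length exceeds 50 characters are typed as Text rather than Categorical. The check in isolation:

import pandas as pd

s = pd.Series([
    "This review is long enough that its average length crosses the cutoff.",
    "Another long free-form comment, well past fifty characters in length.",
])

avg_length = s.str.len().mean()
inferred_text = avg_length > 50  # True -> the column would be typed as Text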
github FeatureLabs / nlp_primitives / nlp_primitives / title_word_count.py
Description:
        Given a list of strings, determine the number of title words
        in each string. A title word is defined as any word starting
        with a capital letter. Words at the start of a sentence will
        be counted.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['My favorite movie is Jaws.', 'this is a string', 'AAA']
        >>> title_word_count = TitleWordCount()
        >>> title_word_count(x).tolist()
        [2.0, 0.0, 1.0]
    """
    name = "title_word_count"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):
        pattern = r'([A-Z][^\s]*)'

        def title_word_count(x):
            x = x.reset_index(drop=True)
            counts = x.str.extractall(pattern).groupby(level=0).count()[0]
            counts = counts.reindex_like(x).fillna(0)
            counts[x.isnull()] = np.nan
            return counts.astype(float)
        return title_word_count
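For intuition, the same per-row count can be written with pandas.Series.str.count, which also propagates NaN; a minimal sketch:

import pandas as pd

x = pd.Series(['My favorite movie is Jaws.', 'this is a string', 'AAA', None])

# count words starting with a capital letter; the None row stays NaN
counts = x.str.count(r'[A-Z][^\s]*').astype(float)  # [2.0, 0.0, 1.0, nan]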
github FeatureLabs / nlp_primitives / nlp_primitives / punctuation_count.py
Description:
        Given a list of strings, determine the number of punctuation
        characters in each string. Looks for any of the following:

        !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['This is a test file.', 'This is second line', 'third line: $1,000']
        >>> punctuation_count = PunctuationCount()
        >>> punctuation_count(x).tolist()
        [1.0, 0.0, 3.0]
    """
    name = "punctuation_count"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):
        pattern = "(%s)" % '|'.join([re.escape(x) for x in string.punctuation])

        def punctuation_count(x):
            x = x.reset_index(drop=True)
            counts = x.str.extractall(pattern).groupby(level=0).count()[0]
            counts = counts.reindex_like(x).fillna(0)
            counts[x.isnull()] = np.nan
            return counts.astype(float)
        return punctuation_count
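The pattern is just an alternation over string.punctuation; counting matches directly with re reproduces the docstring's last value:

import re
import string

# same pattern construction as in get_function above
pattern = "(%s)" % '|'.join(re.escape(ch) for ch in string.punctuation)

len(re.findall(pattern, 'third line: $1,000'))  # 3: ':', '$' and ','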
github FeatureLabs / nlp_primitives / nlp_primitives / polarity_score.py
class PolarityScore(TransformPrimitive):
    """Calculates the polarity of a text on a scale from -1 (negative) to 1 (positive)

    Description:
        Given a list of strings assign a polarity score from -1 (negative text),
        to 0 (neutral text), to 1 (positive text). The functions returns a score
        for every given piece of text. If a string is missing, return 'NaN'

    Examples:
        >>> x = ['He loves dogs', 'She hates cats', 'There is a dog', '']
        >>> polarity_score = PolarityScore()
        >>> polarity_score(x).tolist()
        [0.677, -0.649, 0.0, 0.0]
    """
    name = "polarity_score"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):
        dtk = TreebankWordDetokenizer()

        def polarity_score(x):
            try:
                vader = SentimentIntensityAnalyzer()
            except LookupError:
                nltk.download('vader_lexicon')
                vader = SentimentIntensityAnalyzer()
            li = []

            def vader_pol(sentence):
                return (vader.polarity_scores(sentence)['pos'] -
                        vader.polarity_scores(sentence)['neg'])
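A minimal standalone sketch of the same VADER computation, following the download-and-retry pattern used throughout these primitives:

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

try:
    vader = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    vader = SentimentIntensityAnalyzer()

scores = vader.polarity_scores('He loves dogs')
polarity = scores['pos'] - scores['neg']  # about 0.677, as in the example above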