How to use textstat - 10 common examples

To help you get started, we’ve selected a few textstat examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github AWegnerGitHub / SE_Zephyr_VoteRequest_bot / utils / utils.py View on Github external
num_letters = sum(1 for c in no_code_text if c.isalpha())
    num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum
    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(TextFeature('Number of other characters', num_otherchars, group_by))
    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(TextFeature('Character count for "{}"'.format(c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(TextFeature('Character count for "{}"'.format(c[0]), c[1], group_by))

    results.append(TextFeature('Number of syllables', textstat.syllable_count(no_code_text), group_by))
    results.append(TextFeature('Lexicon Count (without punctuation)', textstat.lexicon_count(no_code_text, True), group_by))
    results.append(TextFeature('Lexicon Count (with punctuation)', textstat.lexicon_count(no_code_text, False), group_by))
    results.append(TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(TextFeature('Number of upper case characters', num_upper, group_by))
    return results
github AWegnerGitHub / SE_Zephyr_VoteRequest_bot / utils / utils.py View on Github external
num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum
    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(TextFeature('Number of other characters', num_otherchars, group_by))
    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(TextFeature('Character count for "{}"'.format(c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(TextFeature('Character count for "{}"'.format(c[0]), c[1], group_by))

    results.append(TextFeature('Number of syllables', textstat.syllable_count(no_code_text), group_by))
    results.append(TextFeature('Lexicon Count (without punctuation)', textstat.lexicon_count(no_code_text, True), group_by))
    results.append(TextFeature('Lexicon Count (with punctuation)', textstat.lexicon_count(no_code_text, False), group_by))
    results.append(TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(TextFeature('Number of upper case characters', num_upper, group_by))
    return results
github bburns / LanguageModels / src / wp / data.py View on Github external
for filepath in glob.glob(self.cleaned_files):
            # with open(filepath, 'rb') as f:
            with open(filepath, 'r') as f:
                s = f.read()
                s = s.lower()
                words = s.split(' ')
                filetitle = util.filetitle(filepath)
                sentences = tokenize.sent_tokenize(s)
                nchars = len(s)
                nwords = len(words)
                nsentences = len(sentences)
                ncharsword = round(nchars/nwords,1)
                nwordssentence = round(nwords/nsentences,1)
                nuniquewords = len(set(words))
                uniquerate = nuniquewords / nwords
                grade_level = int(round(textstat.coleman_liau_index(s)))
                row = [filetitle, nchars, nwords, nsentences, ncharsword, nwordssentence, nuniquewords, uniquerate, grade_level]
                rows.append(row)
        nchars = sum([row[1] for row in rows])
        nwords = sum([row[2] for row in rows])
        row = ['Totals',nchars, nwords, '','','','','','']
        rows.append(row)
        df = pd.DataFrame(rows, columns=cols)
        df = df.drop('Chars',axis=1) # not enough space...
        df = df.drop('Sentences',axis=1)
        df = df.drop('Unique Rate',axis=1)
        return df
github ourresearch / oadoi / models / badge.py View on Github external
def decide_if_assigned_threshold(self, person, threshold):
        """Assign the badge based on the average reading level of a person's products.

        For each product, builds a text from the title and (if available) the
        Mendeley abstract, scores it with the Flesch-Kincaid grade-level test,
        and — when at least one product could be scored — sets the candidate
        badge's value to the mean grade level and marks the badge assigned.

        NOTE(review): ``threshold`` is accepted for interface compatibility but
        is not used by this implementation — confirm against sibling badges.
        """
        reading_levels = {}
        for my_product in person.all_products:
            text = ""
            if my_product.title:
                text += u" " + my_product.title
            # Cache the abstract: get_abstract_using_mendeley() presumably hits
            # the Mendeley API, so avoid calling it twice per product.
            abstract = my_product.get_abstract_using_mendeley()
            if abstract:
                text += u" " + abstract

            # only score if at least one sentence has more than three words,
            # otherwise the textstat library prints too many
            # "Not Enough Words" error messages
            if text:
                sentences = text.split(".")
                if any(len(sentence.split()) > 3 for sentence in sentences):
                    try:
                        grade_level = textstat.flesch_kincaid_grade(text)
                        if grade_level > 0:
                            # is sometimes negative, strangely.  examples in ethan's profile
                            reading_levels[my_product.doi] = grade_level
                    except TypeError:  # if text is too short it throws this
                        pass

        if reading_levels:
            average_reading_level = sum(reading_levels.values()) / float(len(reading_levels))
            self.candidate_badge.value = average_reading_level
            self.assigned = True
github wikimedia / revscoring / revscoring / features / wikitext / features / parsed.py View on Github external
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.content_chars = aggregators.len(
            self.datasources.content,
            name=self._name + ".content_chars"
        )
        """
        `int` : The number of characters of viewable content (no markup or
        templates
        """

        self.flesh_kincaid = Feature(
            self._name + ".flesh_kincaid",
            textstat.flesch_reading_ease,
            depends_on=[self.datasources.content],
            returns=float
        )
        """
        `float` : returns the Flesch reading ease score.
        (https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
        """

        self.headings = aggregators.len(
            self.datasources.headings,
            name=self._name + ".headings"
        )
        "`int` : The number of headings"

        self.external_links = aggregators.len(
            self.datasources.external_links,
github shivam5992 / textstat / textstat / __init__.py View on Github external
from .textstat import textstat


__version__ = (0, 5, 6)


# Promote every public callable on the ``textstat`` singleton to module
# level, so the functions can be used directly from the package namespace.
for attribute in dir(textstat):
    member = getattr(textstat, attribute)
    if not attribute.startswith("_") and callable(member):
        globals()[attribute] = member
github AWegnerGitHub / SE_Zephyr_VoteRequest_bot / utils / utils.py View on Github external
results.append(TextFeature('The SMOG Index', textstat.smog_index(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The SMOG Index', "Undetermined", group_by))
    results.append(TextFeature('Automated Readability Index', textstat.automated_readability_index(no_code_text), group_by))
    results.append(TextFeature('The Coleman-Liau Index', textstat.coleman_liau_index(no_code_text), group_by))
    try:
        results.append(TextFeature('Linsear Write Formula', textstat.linsear_write_formula(no_code_text), group_by))
    except IndexError:
        results.append(TextFeature('Linsear Write Formula', "Undetermined", group_by))
    try:
        results.append(TextFeature('Dale Chall Readability Score', textstat.dale_chall_readability_score(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('Dale Chall Readability Score', "Undetermined", group_by))

    try:
        results.append(TextFeature('Readability Consensus', textstat.readability_consensus(no_code_text), group_by))
    except (TypeError, IndexError):
        results.append(TextFeature('Readability Consensus', "Undetermined; One of the tests above failed.", group_by))
    return results
github bburns / LanguageModels / src / wp / data.py View on Github external
def readability(self):
        """
        Return the grade-school readability level of the merged text.

        Uses the Coleman-Liau index, rounded to one decimal place.  Other
        single-score tests (smog_index, gunning_fog, text_standard) could be
        substituted here if a different measure is preferred.
        """
        merged_text = self.text('merged')
        grade_level = textstat.coleman_liau_index(merged_text)
        return round(grade_level, 1)
github shivam5992 / textstat / textstat / __init__.py View on Github external
from .textstat import textstat


# Package version triple (major, minor, patch).
__version__ = (0, 5, 6)


# Lift every public callable off the ``textstat`` singleton into the module
# namespace, so module-level functions mirror the instance API.
for attribute in dir(textstat):
    if callable(getattr(textstat, attribute)):
        if not attribute.startswith("_"):
            globals()[attribute] = getattr(textstat, attribute)
github GauravBh1010tt / DeepLearn / TrecQA_CNN+Sim / dl_text / rd_ft.py View on Github external
def DaleChall(text):
    """Return the Dale-Chall readability score of *text* via textstat."""
    score = textstat.dale_chall_readability_score(text)
    return score