How to use the nlp.nlp_utils module from the nlp package

To help you get started, we’ve selected a few nlp examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github melqkiades / yelp / source / python / etl / reviews_preprocessor.py View on Github external
def lemmatize_sentences(records):
        """Split each review's text into sentences and lemmatize them.

        Each sentence becomes its own record: a copy of the parent review
        whose text field is replaced by the sentence, annotated with the
        sentence's index within the review and its POS-tagged/lemmatized
        words. If ``Constants.DOCUMENT_LEVEL`` is numeric it caps how many
        sentences are kept per review.

        :type records: list[dict]
        :param records: review dictionaries containing the review text
        :rtype: list[dict]
        :return: one record per kept sentence, enriched with POS tags
        """
        print('%s: lemmatize sentences' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        sentence_records = []
        document_level = Constants.DOCUMENT_LEVEL
        for record in records:
            sentences = \
                nlp_utils.get_sentences(record[Constants.TEXT_FIELD])
            for sentence_index, sentence in enumerate(sentences):
                # A numeric document level limits the number of sentences
                # taken from each review.
                if isinstance(document_level, (int, float)) and \
                        sentence_index >= document_level:
                    break
                tagged_words = nlp_utils.lemmatize_sentence(sentence)
                # Copy the parent review so sibling sentence records do
                # not share mutable state.
                sentence_record = dict(record)
                sentence_record[Constants.TEXT_FIELD] = sentence
                sentence_record['sentence_index'] = sentence_index
                sentence_record[Constants.POS_TAGS_FIELD] = tagged_words
                sentence_records.append(sentence_record)
        # Original built this list but never returned it; return it so
        # callers can actually use the result.
        return sentence_records
github melqkiades / yelp / source / python / topicmodeling / context / topic_latex_generator.py View on Github external
def build_text_automatic(self, record):
        """Build LaTeX itemize content from a review's sentences.

        Splits the review text into sentences, lemmatizes each one, and
        emits one itemize entry per sentence.

        NOTE(review): this block appears truncated in this view — the
        inner word loop continues beyond the visible lines, so the full
        per-word processing and the return value cannot be documented
        here.

        :param record: a review dictionary; must contain the review text
            under ``Constants.TEXT_FIELD``
        """
        text = record[Constants.TEXT_FIELD]
        sentences = nlp_utils.get_sentences(text)
        lemmatized_words = []
        for sentence in sentences:
            # An empty compiled pattern is passed as the second argument;
            # presumably it disables token filtering — TODO confirm
            # against nlp_utils.lemmatize_sentence.
            lemmatized_words.append(nlp_utils.lemmatize_sentence(
                sentence, nltk.re.compile(''),
                min_length=1, max_length=100))

        doc_parts = []
        itemize = Itemize()

        # One itemize entry per sentence; each entry is built from the
        # sentence's (word, tag, ...) tuples.
        for sentence in lemmatized_words:
            new_words = []
            itemize.add_item('')
            for tagged_word in sentence:
                tag = tagged_word[1]
                word = tagged_word[0]
                singular = pattern.text.en.singularize(word)
github melqkiades / yelp / source / python / topicmodeling / context / review_metrics_extractor.py View on Github external
def get_review_metrics(record):
    """
    Returns a list with the metrics of a review. This list is composed
    in the following way: [log(num_sentences + 1), log(num_words + 1),
    log(num_past_verbs + 1), log(num_verbs + 1),
    (log(num_past_verbs + 1) / log(num_verbs + 1))

    NOTE(review): this block appears truncated in this view — the final
    ratio computation and the return statement are not visible here.

    :type record: dict
    :param record: the review that wants to be analyzed, it should contain the
    text of the review
    :rtype: list[float]
    :return: a list with numeric metrics
    """
    review_text = record[Constants.TEXT_FIELD]
    # +1 inside every log keeps the metric defined when a count is zero.
    log_sentences = math.log(len(nlp_utils.get_sentences(review_text)) + 1)
    # log_time_words = math.log(len(self.get_time_words(review.text)) + 1)
    # POS tags are precomputed upstream and stored as
    # (word, tag, lemma) triples — the unpacking below relies on that.
    tagged_words = record[Constants.POS_TAGS_FIELD]
    log_words = math.log(len(tagged_words) + 1)
    counts = Counter(tag for word, tag, lemma in tagged_words)
    # print(counts)
    # VBD = past-tense verb, PRP = personal pronoun (Penn Treebank tags).
    log_past_verbs = math.log(counts['VBD'] + 1)
    log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
    log_personal_pronouns = math.log(counts['PRP'] + 1)
    # log_sentences = float(len(get_sentences(review.text)) + 1)
    # log_words = float(len(get_words(review.text)) + 1)
    # log_time_words = float(len(self.get_time_words(review.text)) + 1)
    # tagged_words = review.tagged_words
    # counts = Counter(tag for word, tag in tagged_words)
    # log_past_verbs = float(counts['VBD'] + 1)
    # log_verbs = float(count_verbs(counts) + 1)
    # log_personal_pronouns = float(counts['PRP'] + 1)
github melqkiades / yelp / source / python / etl / reviews_preprocessor.py View on Github external
def lemmatize_reviews(records):
        """
        Performs a POS tagging on the text contained in the reviews and
        additionally finds the lemma of each word in the review.

        The records are mutated in place: each gains a
        ``Constants.POS_TAGS_FIELD`` entry with the tagged/lemmatized
        words of its text.

        :type records: list[dict]
        :param records: a list of dictionaries with the reviews
        :rtype: list[dict]
        :return: the same list of records, now annotated with POS tags
        """
        print('%s: lemmatize reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # The original kept an unused record_index counter (its progress
        # print was commented out); it has been removed.
        for record in records:
            record[Constants.POS_TAGS_FIELD] = \
                nlp_utils.lemmatize_text(record[Constants.TEXT_FIELD])

        return records
github melqkiades / yelp / source / python / evaluation / classifier_evaluator.py View on Github external
def lemmatize_reviews(records):
    """
    Performs a POS tagging on the text contained in the reviews and
    additionally finds the lemma of each word in the review.

    The records are mutated in place: each gains a
    ``Constants.POS_TAGS_FIELD`` entry with the tagged/lemmatized words
    of its text.

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews
    :rtype: list[dict]
    :return: the same list of records, now annotated with POS tags
    """
    print('%s: lemmatize reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # The original kept an unused record_index counter; it has been
    # removed as dead code.
    for record in records:
        record[Constants.POS_TAGS_FIELD] = \
            nlp_utils.lemmatize_text(record[Constants.TEXT_FIELD])

    return records
github melqkiades / yelp / source / python / etl / reviews_preprocessor.py View on Github external
def pos_tag_reviews(records):
        """POS-tag the text of every review, mutating the records in place.

        A single ``PerceptronTagger`` instance is created once and reused
        for all reviews; each record gains a ``Constants.POS_TAGS_FIELD``
        entry holding the tagged words of its text.

        :type records: list[dict]
        :param records: a list of dictionaries with the reviews
        """
        print('%s: tag reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        perceptron_tagger = PerceptronTagger()

        for review in records:
            review[Constants.POS_TAGS_FIELD] = nlp_utils.tag_words(
                review[Constants.TEXT_FIELD], perceptron_tagger)
github melqkiades / yelp / source / python / topicmodeling / context / review_metrics_extractor.py View on Github external
:type record: dict
    :param record: the review that wants to be analyzed, it should contain the
    text of the review
    :rtype: list[float]
    :return: a list with numeric metrics
    """
    review_text = record[Constants.TEXT_FIELD]
    log_sentences = math.log(len(nlp_utils.get_sentences(review_text)) + 1)
    # log_time_words = math.log(len(self.get_time_words(review.text)) + 1)
    tagged_words = record[Constants.POS_TAGS_FIELD]
    log_words = math.log(len(tagged_words) + 1)
    counts = Counter(tag for word, tag, lemma in tagged_words)
    # print(counts)
    log_past_verbs = math.log(counts['VBD'] + 1)
    log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
    log_personal_pronouns = math.log(counts['PRP'] + 1)
    # log_sentences = float(len(get_sentences(review.text)) + 1)
    # log_words = float(len(get_words(review.text)) + 1)
    # log_time_words = float(len(self.get_time_words(review.text)) + 1)
    # tagged_words = review.tagged_words
    # counts = Counter(tag for word, tag in tagged_words)
    # log_past_verbs = float(counts['VBD'] + 1)
    # log_verbs = float(count_verbs(counts) + 1)
    # log_personal_pronouns = float(counts['PRP'] + 1)

    # This ensures that when log_verbs = 0 the program won't crash
    if log_verbs == 0:
        past_verbs_ratio = 0
    else:
        past_verbs_ratio = log_past_verbs / log_verbs
    # This ensures that when log_verbs = 0 the program won't crash