# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def lemmatize_sentences(records):
    """
    Splits the text of each record into sentences and POS-tags/lemmatizes
    each sentence, producing one new record per kept sentence.

    Each sentence record is a shallow copy of the original record with the
    sentence text under Constants.TEXT_FIELD, its position within the review
    under 'sentence_index', and the tagged words under
    Constants.POS_TAGS_FIELD.

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews; each must
        contain the review text under Constants.TEXT_FIELD
    :rtype: list[dict]
    :return: a list with one record per sentence
    """
    print('%s: lemmatize sentences' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    document_level = Constants.DOCUMENT_LEVEL
    # When DOCUMENT_LEVEL is numeric it caps how many sentences per review
    # are kept; hoist the type check out of the per-sentence loop.
    max_sentences = \
        document_level if isinstance(document_level, (int, float)) else None

    sentence_records = []
    for record in records:
        sentences = \
            nlp_utils.get_sentences(record[Constants.TEXT_FIELD])
        for sentence_index, sentence in enumerate(sentences):
            if max_sentences is not None and sentence_index >= max_sentences:
                break
            sentence_record = {}
            sentence_record.update(record)
            sentence_record[Constants.TEXT_FIELD] = sentence
            sentence_record['sentence_index'] = sentence_index
            sentence_record[Constants.POS_TAGS_FIELD] = \
                nlp_utils.lemmatize_sentence(sentence)
            sentence_records.append(sentence_record)
    # BUG FIX: the original built sentence_records but never returned it,
    # so callers always received None.
    return sentence_records
# NOTE(review): this method is truncated in this file — `singular` is computed
# but never used, and `doc_parts`/`itemize` are built but never consumed.
# The remainder of the body appears to be missing; restore it before use.
def build_text_automatic(self, record):
# Lemmatize every sentence of the record's text, then (apparently) build an
# itemized document from the tagged words.
text = record[Constants.TEXT_FIELD]
sentences = nlp_utils.get_sentences(text)
lemmatized_words = []
for sentence in sentences:
# An empty compiled pattern is passed as the stopword/filter regex —
# presumably meaning "filter nothing"; TODO confirm against
# nlp_utils.lemmatize_sentence's signature.
lemmatized_words.append(nlp_utils.lemmatize_sentence(
sentence, nltk.re.compile(''),
min_length=1, max_length=100))
doc_parts = []
itemize = Itemize()
for sentence in lemmatized_words:
new_words = []
itemize.add_item('')
for tagged_word in sentence:
# Each tagged word looks like a (word, tag, ...) tuple — verify.
tag = tagged_word[1]
word = tagged_word[0]
# pattern.text.en.singularize: normalize plural nouns to singular.
singular = pattern.text.en.singularize(word)
def get_review_metrics(record):
"""
Returns a list with the metrics of a review. This list is composed
in the following way: [log(num_sentences + 1), log(num_words + 1),
log(num_past_verbs + 1), log(num_verbs + 1),
(log(num_past_verbs + 1) / log(num_verbs + 1))
:type record: dict
:param record: the review that wants to be analyzed, it should contain the
text of the review
:rtype: list[float]
:return: a list with numeric metrics
"""
# NOTE(review): this copy of the function is truncated — the ratio
# computation and the return statement promised by the docstring are
# missing here, so as written it returns None.
review_text = record[Constants.TEXT_FIELD]
log_sentences = math.log(len(nlp_utils.get_sentences(review_text)) + 1)
# log_time_words = math.log(len(self.get_time_words(review.text)) + 1)
# Requires the record to have been processed by lemmatize_reviews first,
# since it reads the precomputed POS tags.
tagged_words = record[Constants.POS_TAGS_FIELD]
log_words = math.log(len(tagged_words) + 1)
# Each tagged word is unpacked as (word, tag, lemma) — a 3-tuple.
counts = Counter(tag for word, tag, lemma in tagged_words)
# print(counts)
# 'VBD' = past-tense verbs, 'PRP' = personal pronouns (Penn Treebank tags).
log_past_verbs = math.log(counts['VBD'] + 1)
log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
log_personal_pronouns = math.log(counts['PRP'] + 1)
# log_sentences = float(len(get_sentences(review.text)) + 1)
# log_words = float(len(get_words(review.text)) + 1)
# log_time_words = float(len(self.get_time_words(review.text)) + 1)
# tagged_words = review.tagged_words
# counts = Counter(tag for word, tag in tagged_words)
# log_past_verbs = float(counts['VBD'] + 1)
# log_verbs = float(count_verbs(counts) + 1)
# log_personal_pronouns = float(counts['PRP'] + 1)
def lemmatize_reviews(records):
    """
    Performs a POS tagging on the text contained in the reviews and
    additionally finds the lemma of each word in the review

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews; each record is
        mutated in place, storing the tagged words under
        Constants.POS_TAGS_FIELD
    :rtype: list[dict]
    :return: the same list of records
    """
    print('%s: lemmatize reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    for record in records:
        # Tag and lemmatize the full review text in one pass.
        record[Constants.POS_TAGS_FIELD] = \
            nlp_utils.lemmatize_text(record[Constants.TEXT_FIELD])
    # The original kept a record_index counter that was never read
    # (its progress print was commented out); removed as dead code.
    return records
def lemmatize_reviews(records):
    """
    Performs a POS tagging on the text contained in the reviews and
    additionally finds the lemma of each word in the review

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews
    """
    # NOTE(review): this definition duplicates an earlier lemmatize_reviews
    # in this file; at import time the later definition wins.
    print('%s: lemmatize reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    for record_index, record in enumerate(records):
        review_text = record[Constants.TEXT_FIELD]
        record[Constants.POS_TAGS_FIELD] = nlp_utils.lemmatize_text(review_text)
    return records
def pos_tag_reviews(records):
    """
    Performs POS tagging on the text of each review.

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews; each record is
        mutated in place, storing the tagged words under
        Constants.POS_TAGS_FIELD
    :rtype: list[dict]
    :return: the same list of records (added for consistency with
        lemmatize_reviews, which also returns the mutated list)
    """
    print('%s: tag reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    # Build the tagger once and reuse it for every record; constructing a
    # PerceptronTagger per review would be needlessly expensive.
    tagger = PerceptronTagger()
    for record in records:
        tagged_words = \
            nlp_utils.tag_words(record[Constants.TEXT_FIELD], tagger)
        record[Constants.POS_TAGS_FIELD] = tagged_words
    return records
# NOTE(review): orphan fragment — this is the tail of a duplicated
# get_review_metrics whose `def` line and opening docstring quotes are
# missing from this file. As-is this span is not valid Python; it is kept
# byte-identical below and should be merged with the earlier copy.
:type record: dict
:param record: the review that wants to be analyzed, it should contain the
text of the review
:rtype: list[float]
:return: a list with numeric metrics
"""
review_text = record[Constants.TEXT_FIELD]
log_sentences = math.log(len(nlp_utils.get_sentences(review_text)) + 1)
# log_time_words = math.log(len(self.get_time_words(review.text)) + 1)
tagged_words = record[Constants.POS_TAGS_FIELD]
log_words = math.log(len(tagged_words) + 1)
# Tagged words are unpacked as (word, tag, lemma) 3-tuples.
counts = Counter(tag for word, tag, lemma in tagged_words)
# print(counts)
# 'VBD' = past-tense verbs, 'PRP' = personal pronouns (Penn Treebank tags).
log_past_verbs = math.log(counts['VBD'] + 1)
log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
log_personal_pronouns = math.log(counts['PRP'] + 1)
# log_sentences = float(len(get_sentences(review.text)) + 1)
# log_words = float(len(get_words(review.text)) + 1)
# log_time_words = float(len(self.get_time_words(review.text)) + 1)
# tagged_words = review.tagged_words
# counts = Counter(tag for word, tag in tagged_words)
# log_past_verbs = float(counts['VBD'] + 1)
# log_verbs = float(count_verbs(counts) + 1)
# log_personal_pronouns = float(counts['PRP'] + 1)
# This ensures that when log_verbs = 0 the program won't crash
if log_verbs == 0:
past_verbs_ratio = 0
else:
past_verbs_ratio = log_past_verbs / log_verbs
# This ensures that when log_verbs = 0 the program won't crash