raise Exception("not implemented")

sentences = {marker: {"sentence": [], "previous": []} for marker in EN_DISCOURSE_MARKERS}

for filename in filenames:
    print("reading {}".format(filename))
    file_path = pjoin(source_dir, "orig", filename)
    with io.open(file_path, 'r', encoding="utf-8") as f:
        # tokenize sentences
        sentences_cache_file = file_path + ".CACHE_SENTS"
        if caching and os.path.isfile(sentences_cache_file):
            sent_list = pickle.load(open(sentences_cache_file, "rb"))
        else:
            tokens = f.read().replace("\n", ". ")
            print("tokenizing")
            sent_list = nltk.sent_tokenize(tokens)
            if caching:
                pickle.dump(sent_list, open(sentences_cache_file, "wb"))

    # check each sentence for discourse markers
    previous_sentence = ""
    for sentence in sent_list:
        words = rephrase(sentence).split()  # replace "for example"
        for marker in EN_DISCOURSE_MARKERS:
            if marker == "for example":
                proxy_marker = "for_example"
            else:
                proxy_marker = marker
            if proxy_marker in [w.lower() for w in words]:
                sentences[marker]["sentence"].append(sentence)
                sentences[marker]["previous"].append(previous_sentence)
        previous_sentence = sentence  # keep the current sentence as context for the next one
def append_candidate_rels(entry, summ, all_ents, prons, players, teams, cities, candrels):
    """
    appends tuples of form (sentence_tokens, [rels]) to candrels
    """
    sents = sent_tokenize(summ)
    for j, sent in enumerate(sents):
        # tokes = word_tokenize(sent)
        tokes = sent.split()
        ents = extract_entities(tokes, all_ents, prons)
        nums = extract_numbers(tokes)
        rels = get_rels(entry, ents, nums, players, teams, cities)
        if len(rels) > 0:
            candrels.append((tokes, rels))
    return candrels

if not os.path.isfile(fname):
    print("The file friends.json is not present at the entered location.")
    exit(1)

with open(fname) as f:
    base_data = json.load(f)
base_data = base_data["friends"]

# take a representative first-name token for each friend
for ele in base_data:
    fwords = word_tokenize(ele["name"])
    if fwords[0] != "Md" and fwords[0] != "Kumar":
        flist.append(fwords[0])
    else:
        flist.append(fwords[1])

if final_comments != "":
    friend_names = ""
    # keep only PERSON named-entity chunks that match known friend names
    for sent in nltk.sent_tokenize(final_comments):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                if chunk.label()[0] == 'P':
                    if ''.join(c[0] for c in chunk.leaves()) in flist:
                        friend_names = friend_names + " " + ' '.join(c[0] for c in chunk.leaves())
    wordcloud = WordCloud(background_color="white", mask=mask, relative_scaling=1.0,
                          stopwords=set(STOPWORDS)).generate(friend_names)
    plt.imshow(wordcloud)
    plt.axis("off")
    print("WordCloud of the friends you tag most often")
    plt.show()
else:
    print("No comment or post text found")

def make_cfd(text, n, cfd=None, exclude_punctuation=True, case_insensitive=True):
    if not cfd:
        cfd = {}
    if exclude_punctuation:
        nopunct = re.compile(r'^\w+$')
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = nltk.word_tokenize(sent)
        if case_insensitive:
            sent = [word.lower() for word in sent]
        if exclude_punctuation:
            sent = [word for word in sent if nopunct.match(word)]
        for i in range(len(sent) - (n - 1)):
            # condition = the preceding n-1 words, sample = the n-th word
            condition = ' '.join(sent[i:(i + n) - 1])
            sample = sent[(i + n) - 1]
            if condition in cfd:
                if sample in cfd[condition]:
                    cfd[condition][sample] += 1
                else:
                    cfd[condition].update({sample: 1})
            else:
                cfd[condition] = {sample: 1}
    return cfd
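
# A minimal usage sketch (the sample text and the expected counts in the
# comment are illustrative, not from the original source):
import re
import nltk

sample_text = "The cat sat on the mat. The cat slept."
bigram_cfd = make_cfd(sample_text, n=2)
print(bigram_cfd.get("the"))  # {'cat': 2, 'mat': 1}
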
def expand(example):
    return [Example(sentence, example.author) for sentence in sent_tokenize(example.passage)]
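
# `Example` and `sent_tokenize` are imported elsewhere in the original module.
# A minimal sketch of a record that would make expand() runnable; the field
# names `passage` and `author` come from the usage above, but the namedtuple
# itself is an assumption:
from collections import namedtuple
from nltk import sent_tokenize

Example = namedtuple("Example", ["passage", "author"])
doc = Example("First sentence. Second sentence.", "anon")
print(expand(doc))  # one Example per sentence, same author
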
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn
    are lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]

def tokenize_text(text):
    # `unit` and `language` are captured from the enclosing scope in the
    # original source; they are not parameters of this helper.
    if unit == 'word':
        return nltk.word_tokenize(text, language)
    elif unit.startswith('sent'):
        return nltk.sent_tokenize(text, language)
    else:
        raise ValueError(
            "unit must be either 'word' or 'sentence'")
def load_data(filename="data/reddit-comments-2015-08.csv", vocabulary_size=2000, min_sent_characters=0):
    # Read the data
    print("Reading CSV file...")
    with open(filename, 'rt', encoding="utf-8") as f:
        reader = csv.reader(f, skipinitialspace=True)
        next(reader)  # skip the header row
        # Split full comments into sentences
        sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
        # Filter sentences
        sentences = [s for s in sentences if len(s) >= min_sent_characters]
        sentences = [s for s in sentences if "http" not in s]
    print("Parsed %d sentences." % len(sentences))
    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print("Found %d unique word tokens." % len(word_freq.items()))
    # Get the most common words and build index2word and word2index vectors
    vocab = sorted(word_freq.items(), key=lambda x: (x[1], x[0]), reverse=True)[:vocabulary_size - 1]
    print("Using vocabulary size %d." % vocabulary_size)
        pass
    elif w[0] == ' ':
        w = w[1:]
        exfin.append(w)
    elif w[0] == '[':
        w = w[1:]
        exfin.append(w)
    elif w[-1] == ']':
        w = w[:-1]
        exfin.append(w)
    else:
        exfin.append(w)
ex = ' '.join(exfin)
# split into sentences
exs = tok(ex)
# start with second sentence, end with second-to-last
if exs[1][:2] == '" ':  # skip initial quotation mark if any
    exs[1] = exs[1][2:]
blurb = '> ... ' + ' '.join(exs[1:-1]) + ' ...'
outtro = leave[randint(0, len(leave) - 1)]
usedbooktitles.append(title)
titlelink = '[' + title + '](' + guturl + ')'
usedbooktitlesandlinks.append(titlelink)
return ' '.join(intro)[1:], blurb, outtro

def _augment(ih, dict_, key, is_doc):
    assert isinstance(ih, CoreNLPInterface)
    content = dict_[key]
    if is_doc:
        sents = nltk.sent_tokenize(content)
    else:
        sents = [content]
    # words = list(map(ih.split_sent, sents))
    const = list(map(ih.get_const, sents))
    dep = list(map(ih.get_dep, sents))
    if not is_doc:
        const = const[0]
        dep = dep[0]
    dict_["{}_const".format(key)] = const
    dict_["{}_dep".format(key)] = dep
    if is_doc:
        return sum(each is None for each in dep)
    return int(dep is None)