def sentence_split(self, document):
    """split document into sentences"""
    lines = document.split('\n')
    if self.presplit_sentences:
        # Input already has one sentence per line; just drop blank lines.
        return [line for line in lines if line]
    rtn = []
    for line in lines:
        if line != '':
            rtn.extend(tokenize.sent_tokenize(line))
    return rtn
def get_sentences(data):
    # Split on newlines and on ASCII/Arabic commas, then sentence-tokenize each chunk.
    return [sent for line in re.split("[\n,،]+", data) if line
            for sent in sent_tokenize(line.strip()) if sent]
    # return [sent for line in data.split('\n') if line for sent in sent_tokenize(line) if sent]
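For reference, a self-contained sketch of the same one-liner with its output on a small illustrative string (requires nltk's punkt data):

import re
from nltk.tokenize import sent_tokenize  # needs: nltk.download('punkt')

def get_sentences(data):
    return [sent for line in re.split("[\n,،]+", data) if line
            for sent in sent_tokenize(line.strip()) if sent]

print(get_sentences("First part, second part.\nAnother line. And more."))
# ['First part', 'second part.', 'Another line.', 'And more.']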
# Per-author statistics: total word count, vocabulary size and sentence count.
authorFileNames = os.listdir(directory)
author = {}
for file in authorFileNames:
    if file.endswith(".xml"):
        file_path = directory + "/" + file
        xmldoc = minidom.parse(file_path)
        rawdocuments = xmldoc.getElementsByTagName('document')
        length = 0
        vocabulary = set()
        text = ""
        no_of_sentences = 0
        for document in rawdocuments:
            text = removeTag_CDATA_section(document.firstChild.nodeValue.strip())
            sentences = nltk.tokenize.sent_tokenize(text)
            for sentence in sentences:
                length = length + len(sentence.split())
            if vocabulary:
                vocabulary.update(set(text.split()))
            else:
                vocabulary = set(text.split())
            no_of_sentences = no_of_sentences + len(sentences)
        if vocabulary:
            author[file.split('.')[0]] = [length, len(vocabulary), no_of_sentences]
        else:
            author[file.split('.')[0]] = [length, 0, no_of_sentences]
ifile = open('truth.txt')
truth_data = ifile.readlines()
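removeTag_CDATA_section is not shown in the excerpt; a plausible stand-in, purely an assumption based on its name, strips a literal CDATA wrapper if one is present:

import re

def removeTag_CDATA_section(text):
    # Hypothetical helper: drop a surrounding <![CDATA[ ... ]]> wrapper, if any.
    match = re.match(r"^<!\[CDATA\[(.*)\]\]>$", text, flags=re.DOTALL)
    return match.group(1).strip() if match else text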
def split(self, dataset):
    """
    :type dataset: structures.data.Dataset
    """
    for part in dataset.parts():
        part.sentences = sent_tokenize(part.text)
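The method only assumes a parts() iterable whose items carry text and sentences attributes; throwaway stand-ins (the Part and Dataset classes below are placeholders, not the project's structures.data classes) are enough to see what it does:

from nltk.tokenize import sent_tokenize

class Part:                        # placeholder, not structures.data.Part
    def __init__(self, text):
        self.text = text
        self.sentences = None

class Dataset:                     # placeholder, not structures.data.Dataset
    def __init__(self, parts):
        self._parts = parts
    def parts(self):
        return self._parts

ds = Dataset([Part("One sentence here. And a second one.")])
for part in ds.parts():
    part.sentences = sent_tokenize(part.text)
print(ds.parts()[0].sentences)     # ['One sentence here.', 'And a second one.']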
def read_dataset(paths):
    for p in paths:
        with open(p) as fin:
            text = fin.read()
        for s in sent_tokenize(text):
            words = word_tokenize(s)
            yield [w for w in words if w not in _stop_words]
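read_dataset is a generator yielding one stopword-free token list per sentence, so it is consumed lazily. A minimal usage sketch with read_dataset as defined above; the paths are illustrative and _stop_words is assumed to be a set of lowercase stopwords:

from nltk.corpus import stopwords

_stop_words = set(stopwords.words("english"))   # assumed definition

for tokens in read_dataset(["corpus/a.txt", "corpus/b.txt"]):  # illustrative paths
    print(tokens)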
def preprocessing(self):
    """Preprocess the text to make it more suitable for training."""
    paras = []
    labels = []
    texts = []
    for idx in range(self.text.shape[0]):
        text = self.clean_string(self.text[idx])
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        paras.append(sentences)
    tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), self.max_senten_num,
                     self.max_senten_len), dtype='int32')
    for i, sentences in enumerate(paras):
        for j, sent in enumerate(sentences):
            if j < self.max_senten_num:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < self.max_senten_len and word in tokenizer.word_index and tokenizer.word_index[word] < self.max_features:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
    self.word_index = tokenizer.word_index
    if self.verbose == 1:
        ...  # verbose summary output omitted in this excerpt
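The loop builds a zero-padded 3-D index tensor of shape (num_documents, max_senten_num, max_senten_len), the usual input layout for hierarchical (document/sentence/word) models. A toy illustration of that layout with made-up limits and a hand-rolled word index, so it runs without Keras (requires nltk's punkt data):

import numpy as np
from nltk import tokenize

texts = ["A short document. It has two sentences.", "Another one."]
max_senten_num, max_senten_len = 3, 5          # made-up limits

# Stand-in for tokenizer.word_index (1-based; 0 is reserved for padding).
vocab = {}
for t in texts:
    for w in t.lower().replace(".", "").split():
        vocab.setdefault(w, len(vocab) + 1)

data = np.zeros((len(texts), max_senten_num, max_senten_len), dtype="int32")
for i, text in enumerate(texts):
    for j, sent in enumerate(tokenize.sent_tokenize(text)[:max_senten_num]):
        for k, word in enumerate(sent.lower().replace(".", "").split()[:max_senten_len]):
            data[i, j, k] = vocab[word]

print(data.shape)   # (2, 3, 5)
print(data[0])      # first document: rows are sentences, columns are word ids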
def count_sents_in_quotes(text):
    v = len(nltk.tokenize.sent_tokenize(text))
    if v > 1 or v == 0:
        return v
    # Exactly one tokenized sentence: count it only if it ends with
    # sentence-final punctuation.
    if text[-1] == '.' or text[-1] == '!' or text[-1] == '?':
        return 1
    return 0
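A few illustrative calls, with nltk imported and the function above in scope:

print(count_sents_in_quotes("Stop right there."))          # 1  (one sentence, ends with '.')
print(count_sents_in_quotes("just a fragment"))            # 0  (one chunk, no final punctuation)
print(count_sents_in_quotes("Wait. What happened here?"))  # 2  (more than one sentence)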
result = dict()
result["text"] = ""
result["stats"] = dict()
result["stats"]["relevant_words"] = []
result["stats"]["word_length"] = 0
result["stats"]["avg_contrast"] = ""
result["stats"]["avg_current"] = ""
result["stats"]["totalSummaries"] = 0
news = text
summarySize = 0  # Store size of summary to retrieve stats
# RegexpTokenizer used to avoid punctuation signs
tokenizer = RegexpTokenizer(r"[a-zA-Z_']+")
words = tokenizer.tokenize(news)
sentences = sent_tokenize(news)
# Retrieve stopword set to remove stopwords from the analysis
stopWords = set(stopwords.words("english"))
# Try both stemmers in the future and keep whichever works better
ps = PorterStemmer()
pss = SnowballStemmer("english")
freq = dict()  # Frequency of each non-stopword
sentenceVal = dict()  # Number of sentences in which a word from freq appears
for w in words:
    w = w.lower()
    if w in stopWords:
        continue
    if w in freq:
        freq[w] += 1
    else:
        freq[w] = 1
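The excerpt stops before sentenceVal is filled in; the usual continuation of this pattern scores each sentence by summing the frequencies of the non-stopwords it contains. A generic, self-contained sketch of that scoring step, shown only as an illustration and not necessarily what this code goes on to do:

from nltk.tokenize import RegexpTokenizer, sent_tokenize

def score_sentences(text, freq):
    """Generic frequency-based sentence scoring (illustrative only)."""
    tok = RegexpTokenizer(r"[a-zA-Z_']+")
    scores = dict()
    for sentence in sent_tokenize(text):
        for w in tok.tokenize(sentence):
            w = w.lower()
            if w in freq:
                scores[sentence] = scores.get(sentence, 0) + freq[w]
    return scores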
def __process_sentences(self, v) -> List[str]:
    sentence = tokenize.sent_tokenize(v)
    return sentence
def extract(self, text):
    '''Return a list of noun phrases (strings) for body of text.'''
    sentences = nltk.tokenize.sent_tokenize(text)
    noun_phrases = []
    for sentence in sentences:
        parsed = self._parse_sentence(sentence)
        # Get the string representation of each subtree that is a
        # noun phrase tree
        phrases = [_normalize_tags(filter_insignificant(each, self.INSIGNIFICANT_SUFFIXES))
                   for each in parsed
                   if isinstance(each, nltk.tree.Tree)
                   and each.label() == 'NP'
                   and len(filter_insignificant(each)) >= 1
                   and _is_match(each, cfg=self.CFG)]
        nps = [tree2str(phrase) for phrase in phrases]
        noun_phrases.extend(nps)
    return noun_phrases
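This method has the shape of TextBlob's ConllExtractor.extract; if that is indeed where it comes from, typical usage goes through a TextBlob instance rather than calling extract directly (requires the relevant NLTK data, e.g. punkt and conll2000):

from textblob import TextBlob
from textblob.np_extractors import ConllExtractor

blob = TextBlob("Python is a high-level programming language for rapid prototyping.",
                np_extractor=ConllExtractor())
print(blob.noun_phrases)   # WordList of lower-cased noun phrases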