How to use the nltk.sent_tokenize function in nltk

To help you get started, we’ve selected a few nltk.sent_tokenize examples, based on popular ways it is used in public projects.

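If you just want to see the function in isolation first, here is a minimal sketch of calling nltk.sent_tokenize directly (it assumes NLTK is installed and that the Punkt tokenizer data has been downloaded):

import nltk

# One-time download of the Punkt sentence tokenizer models
# (recent NLTK releases may ask for 'punkt_tab' instead).
nltk.download('punkt')

text = "NLTK makes sentence splitting easy. It ships with a pre-trained Punkt model."
sentences = nltk.sent_tokenize(text)  # defaults to language='english'
print(sentences)
# ['NLTK makes sentence splitting easy.', 'It ships with a pre-trained Punkt model.']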

github windweller / DisExtract / preprocessing / wikitext.py View on Github
        raise Exception("not implemented")

    sentences = {marker: {"sentence": [], "previous": []} for marker in EN_DISCOURSE_MARKERS}
    
    for filename in filenames:
        print("reading {}".format(filename))
        file_path = pjoin(source_dir, "orig", filename)
        with io.open(file_path, 'r', encoding="utf-8") as f:
            # tokenize sentences
            sentences_cache_file = file_path + ".CACHE_SENTS"
            if caching and os.path.isfile(sentences_cache_file):
                sent_list = pickle.load(open(sentences_cache_file, "rb"))
            else:
                tokens = f.read().replace("\n", ". ")
                print("tokenizing")
                sent_list = nltk.sent_tokenize(tokens)
                if caching:
                    pickle.dump(sent_list, open(sentences_cache_file, "wb"))

        # check each sentence for discourse markers
        previous_sentence = ""
        for sentence in sent_list:
            words = rephrase(sentence).split()  # replace "for example"
            for marker in EN_DISCOURSE_MARKERS:
                if marker == "for example":
                    proxy_marker = "for_example" 
                else:
                    proxy_marker = marker

                if proxy_marker in [w.lower() for w in words]:
                    sentences[marker]["sentence"].append(sentence)
                    sentences[marker]["previous"].append(previous_sentence)
github gau820827 / AI-writer_Data2Doc / evaluate / data_utils.py View on Github
def append_candidate_rels(entry, summ, all_ents, prons, players, teams, cities, candrels):
    """
    appends tuples of form (sentence_tokens, [rels]) to candrels
    """
    sents = sent_tokenize(summ)
    for j, sent in enumerate(sents):
        #tokes = word_tokenize(sent)
        tokes = sent.split()
        ents = extract_entities(tokes, all_ents, prons)
        nums = extract_numbers(tokes)
        rels = get_rels(entry, ents, nums, players, teams, cities)
        if len(rels) > 0:
            candrels.append((tokes, rels))
    return candrels
github kaustubhhiware / facebook-archive / wordclouds.py View on Github
    if not os.path.isfile(fname):
        print("The file friends.json is not present at the entered location.")
        exit(1)
    with open(fname) as f:
        base_data = json.load(f)
    base_data = base_data["friends"]
    for ele in base_data:
        fwords = word_tokenize(ele["name"])
        if fwords[0]!="Md" and fwords[0]!="Kumar":
            flist.append(fwords[0])
        else:
            flist.append(fwords[1])
            
    if final_comments!="":
        friend_names = ""
        for sent in nltk.sent_tokenize(final_comments):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    if chunk.label()[0] == 'P':  # keep PERSON chunks
                        if ''.join(c[0] for c in chunk.leaves()) in flist:
                            friend_names = friend_names + " " + ' '.join(c[0] for c in chunk.leaves())

        wordcloud = WordCloud(background_color="white", mask=mask, relative_scaling=1.0,
                              stopwords=set(STOPWORDS)).generate(friend_names)

        plt.imshow(wordcloud)
        plt.axis("off")
        print("WordCloud of Your friends mostly tagged by you")
        plt.show()
    else:
        print("No Comments and Posts Text Found")
github thallada / nlp / syntax_aware_generate.py View on Github
def make_cfd(text, n, cfd=None, exclude_punctuation=True, case_insensitive=True):
    if not cfd:
        cfd = {}
    if exclude_punctuation:
        nopunct = re.compile(r'^\w+$')
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = nltk.word_tokenize(sent)
        if case_insensitive:
            sent = [word.lower() for word in sent]
        if exclude_punctuation:
            sent = [word for word in sent if nopunct.match(word)]
        for i in range(len(sent) - (n - 1)):
            condition = ' '.join(sent[i:(i + n) - 1])
            sample = sent[(i + n) - 1]
            if condition in cfd:
                if sample in cfd[condition]:
                    cfd[condition][sample] += 1
                else:
                    cfd[condition].update({sample: 1})
            else:
                cfd[condition] = {sample: 1}
github VeritasAuthorship / veritas / models / sentence_wise_classification.py View on Github
def expand(example):
    return [Example(sentence, example.author) for sentence in sent_tokenize(example.passage)]
github foxbook / atap / snippets / ch03 / preprocess.py View on Github
    def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part of speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ]
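The same split-then-tag pattern also works on a plain string outside a corpus reader; a minimal sketch, assuming the Punkt and averaged_perceptron_tagger data are available:

from nltk import sent_tokenize, wordpunct_tokenize, pos_tag

document = "The corpus reader yields paragraphs. Each paragraph becomes tagged sentences."
# A list of sentences, each a list of (token, POS-tag) pairs
tagged = [pos_tag(wordpunct_tokenize(sent)) for sent in sent_tokenize(document)]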
github tyarkoni / pliers / pliers / stimuli / text.py View on Github
            def tokenize_text(text):
                if unit == 'word':
                    return nltk.word_tokenize(text, language)
                elif unit.startswith('sent'):
                    return nltk.sent_tokenize(text, language)
                else:
                    raise ValueError(
                        "unit must be either 'word' or 'sentence'")
github jiangnanhugo / SimpleRNN / RNN_Encoder_Decoder / util.py View on Github
def load_data(filename="data/reddit-comments-2015-08.csv",vocabulary_size=2000,min_sent_characters=0):

    # Read the data
    print("Reading CSV file...")
    with open(filename, 'rt', encoding='utf-8') as f:
        reader = csv.reader(f, skipinitialspace=True)
        next(reader)  # skip the CSV header row
        # Split full comments into sentences
        sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
        # Filter sentences
        sentences = [s for s in sentences if len(s) >= min_sent_characters]
        sentences = [s for s in sentences if "http" not in s]

    print("parsed %d sentences." %(len(sentences)))

    # Tokenize the sentences into words
    tokenized_sentences=[nltk.word_tokenize(sent) for sent in sentences]

    # Count the word frequencies
    word_freq=nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print("Found %d unique word tokens." %len(word_freq.items()))

    # Get the most common words and build index2word and word2index vectors
    vocab=sorted(word_freq.items(),key=lambda x:(x[1],x[0]),reverse=True)[:vocabulary_size-1]
    print ("Using vocabulary size %d." % vocabulary_size)
github robincamille / nanogenmo2016 / excerpt.py View on Github
            pass
        elif w[0] == ' ':
            w = w[1:]
            exfin.append(w)
        elif w[0] == '[':
            w = w[1:]
            exfin.append(w)
        elif w[-1] == ']':
            w = w[:-1]
            exfin.append(w)
        else:
            exfin.append(w)
    ex = ' '.join(exfin)
    
    # split into sentences
    exs = tok(ex)

    # start with second sentence, end with second-to-last
    if exs[1][:2] == '" ': # skip initial quotation mark if any
        exs[1] = exs[1][2:]
    blurb = '> ... ' + (' '.join(exs[1:-1])) + ' ...'
    
    outtro = (leave[randint(0,len(leave)-1)])

    usedbooktitles.append(title)
    titlelink = '[' + title + '](' + guturl + ')'
    usedbooktitlesandlinks.append(titlelink)
    
    return ' '.join(intro)[1:], blurb, outtro
github allenai / bi-att-flow / augment_squad.py View on Github
def _augment(ih, dict_, key, is_doc):
    assert isinstance(ih, CoreNLPInterface)
    content = dict_[key]
    if is_doc:
        sents = nltk.sent_tokenize(content)
    else:
        sents = [content]
    # words = list(map(ih.split_sent, sents))
    const = list(map(ih.get_const, sents))
    dep = list(map(ih.get_dep, sents))
    if not is_doc:
        const = const[0]
        dep = dep[0]
    dict_["{}_const".format(key)] = const
    dict_["{}_dep".format(key)] = dep
    if is_doc:
        return sum(each is None for each in dep)
    return int(dep is None)