How to use the whitespace_nlp_with_sentences function in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github JasonKessler / scattertext / demo_pair_plot_movies.py View on Github external
import scattertext as st

# Load the bundled Rotten Tomatoes sample (review text plus 'category' and
# 'movie_name' columns) and map the raw labels to display-friendly names.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])

# Build a corpus with one category per movie, parsed by the dependency-free
# whitespace tokenizer (sentence-aware), restricted to stoplisted unigrams.
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

# Pair plot of movies; an optimal category projection is searched for
# (verbose=True logs progress) and the D3 assets are served from local paths.
html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'
    )
)

file_name = 'movie_pair_plot.html'
# Fix: use a context manager so the handle is flushed and closed
# deterministically (the original relied on garbage collection to close it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('./' + file_name)
github JasonKessler / scattertext / demo_bow_pca.py View on Github external
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor, CSRMatrixFactory
from scattertext.representations.CorpusSentenceIterator import CorpusSentenceIterator

# Bundled sample of 2012 Democratic/Republican convention speeches.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
# Parse with the dependency-free whitespace tokenizer (sentence-aware);
# stores one parsed-document object per row for CorpusFromParsedDocuments.
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)

# Build a party-keyed corpus, drop stopworded unigrams, and compact the
# vocabulary with ClassPercentageCompactor(term_count=3).
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus().select(ClassPercentageCompactor(term_count=3)))


# Project the raw term-document counts to 30 SVD dimensions and plot the
# first two components.
# NOTE(review): this call is truncated in this excerpt -- the remaining
# arguments and closing parenthesis are not shown.
html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
github JasonKessler / scattertext / demo_hedges_r.py View on Github external
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

# 2012 convention speeches, one row per speech, with a 'party' category column.
convention_df = SampleCorpora.ConventionData2012.get_data()
# Unigram corpus parsed with the dependency-free whitespace tokenizer.
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
# Score terms with the HedgesR effect-size scorer; grey_threshold=0 means no
# points are greyed out for low scores.
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
# Fix: context manager closes the output file deterministically (the original
# leaked the handle, relying on garbage collection).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
github JasonKessler / scattertext / demo_semiotic.py View on Github external
import scattertext as st

# Rotten Tomatoes sample corpus; categories are 'fresh', 'rotten', and 'plot'.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
#movie_df.category = movie_df.category.apply \
#(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])
# Parse with the dependency-free whitespace tokenizer; unigrams only.
corpus = st.CorpusFromPandas(
	movie_df,
	category_col='category',
	text_col='text',
	nlp=st.whitespace_nlp_with_sentences
).build()
corpus = corpus.get_unigram_corpus()

# Semiotic square contrasting 'fresh' vs. 'rotten' reviews, with 'plot'
# descriptions as the neutral category; RankDifference scores terms.
# NOTE(review): the labels dict (and the rest of this script) is truncated in
# this excerpt -- the closing braces and parentheses are not shown.
semiotic_square = st.SemioticSquare(
	corpus,
	category_a='fresh',
	category_b='rotten',
	neutral_categories=['plot'],
	scorer=st.RankDifference(),
	labels={'not_a_and_not_b': 'Plot Descriptions',
	        'a_and_b': 'Reviews',
	        'a_and_not_b': 'Positive',
	        'b_and_not_a': 'Negative',
	        'a':'',
	        'b':'',
	        'not_a':'',
github JasonKessler / scattertext / demo_four_square.py View on Github external
import time

import pandas as pd
import spacy

import scattertext as st

# NOTE(review): legacy spaCy 1.x/2.x loading style ('en' shortcut with
# parser=False); modern spaCy requires spacy.load('en_core_web_sm', ...).
# The loaded model is unused in the visible code -- parsing below uses
# st.whitespace_nlp_with_sentences instead. Confirm against full script.
nlp = spacy.load('en', parser=False)
t0 = time.time()
# ICLR 2018 reviews downloaded directly from GitHub as a bz2-compressed CSV.
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(st.whitespace_nlp_with_sentences)
full_corpus = (st.CorpusFromParsedDocuments(reviews_df,
                                            category_col='category',
                                            parsed_col='parse',
                                            #feats_from_spacy_doc=st.PhraseMachinePhrases()
                                            ).build())

# Ranker that counts each term at most once per document.
term_ranker = st.OncePerDocFrequencyRanker
# Keep only the four accept/reject x positive/negative quadrants, then restrict
# to unigrams compacted via ClassPercentageCompactor(term_count=5).
corpus = (full_corpus
          .keep_only_these_categories(['Accept, Positive', 'Accept, Negative',
                                       'Reject, Positive', 'Reject, Negative'],
                                      False)
          .get_unigram_corpus()
          .select(st.ClassPercentageCompactor(term_count=5)))

print('finding priors', time.time() - t0, 's')
# NOTE(review): the PriorFactory chain (and the rest of this script) is
# truncated in this excerpt.
priors = (st.PriorFactory(full_corpus, starting_count=0.01)
github JasonKessler / scattertext / demo_cred_tfidf.py View on Github external
import scattertext as st

# Rotten Tomatoes sample: review text with 'category' ('fresh'/'rotten'/'plot')
# and 'movie_name' columns.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Unigram corpus over fresh/rotten reviews only; plot descriptions are removed.
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus().remove_categories(['plot'])

# Credibility-adjusted tf-idf scorer contrasting 'fresh' against 'rotten'.
term_scorer = st.CredTFIDF(corpus).set_categories('fresh', ['rotten'])

# Preview the terms most associated with 'fresh' reviews.
print(term_scorer.get_score_df().sort_values(by='delta_cred_tf_idf', ascending=False).head())

html = st.produce_frequency_explorer(
    corpus,
    category='fresh',
    not_category_name='rotten',
    term_scorer=term_scorer,
    metadata=corpus.get_df()['movie_name'],
    grey_threshold=0
)
file_name = 'demo_cred_tfidf.html'
# Fix: context manager closes the output file deterministically (the original
# left the handle open for garbage collection to reclaim).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
github norMNfan / Reddit-Bot-Classifier / classifier.py View on Github external
# NOTE(review): this excerpt starts mid-function -- the enclosing def, the
# imports, and the loops that populate X_comments/Y_comments/Y_posts_sub are
# not shown; the indentation of this first line also appears scrape-mangled.
Y_posts_sub.append(0)

	# Two-column object array: column 0 = numeric label, column 1 = comment text.
	data = np.empty([len(X_comments), 2], dtype=object)
	data[:, 0] = Y_comments
	data[:, 1] = X_comments

	# Replace numeric labels with class names (0 -> 'normal', anything else -> 'bot').
	for d in data:
		if d[0] == 0:
			d[0] = 'normal'
		else:
			d[0] = 'bot'

	df = pd.DataFrame({'label': data[:, 0], 'text':data[:, 1]})
	print(df)

	# Scattertext corpus keyed on the bot/normal label; unigrams only, compacted
	# via ClassPercentageCompactor(term_count=2) with once-per-doc ranking.
	corpus = (st.CorpusFromPandas(df, category_col='label', text_col='text', nlp=st.whitespace_nlp_with_sentences)
		.build()
		.get_unigram_corpus()
		.compact(st.ClassPercentageCompactor(term_count=2, term_ranker=st.OncePerDocFrequencyRanker)))

	# Characteristic-term explorer comparing normal users against bots.
	html = st.produce_characteristic_explorer(
		corpus,
		category='normal',
		category_name='Normal',
		not_category_name='Bot'
	)
	# NOTE(review): the file handle is never closed explicitly here.
	open('comment_text_chart.html', 'wb').write(html.encode('utf-8'))
github JasonKessler / scattertext / demo_log_odds_ratio_prior_rotten_tomatoes.com.py View on Github external
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Output file for the interactive plot.
fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
# Corpus over the raw 'category' labels ('fresh'/'rotten'/'plot'), parsed with
# the dependency-free whitespace tokenizer.
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
	.build())
# Dirichlet prior counts built from general term frequencies plus counts from
# all categories, smoothed with starting_count=1.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
	.use_general_term_frequencies()
	.use_all_categories()
	.get_priors())
# NOTE(review): this call chain is truncated in this excerpt -- the remaining
# produce_fightin_words_explorer arguments and closing parens are not shown;
# the file handle opened here is also never closed explicitly.
(open(fn, 'wb')
	.write(
	st.produce_fightin_words_explorer(
		corpus,
		category='fresh',
		not_categories=['rotten'],
		metadata=df['movie_name'],
		term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
github JasonKessler / scattertext / demo_embeddings_pca.py View on Github external
import scattertext as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.linalg import svds

convention_df = st.SampleCorpora.ConventionData2012.get_data()
# Parse with the dependency-free whitespace tokenizer (sentence-aware).
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
# Party-keyed corpus of stoplisted unigrams, with terms appearing in fewer
# than 3 documents removed (once-per-doc ranking).
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .remove_infrequent_words(minimum_term_count=3, term_ranker=st.OncePerDocFrequencyRanker))
# Tf-idf weight the term-document matrix, transpose it, and take a k=3
# truncated SVD of the result; U then holds per-term coordinates.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k = 3, maxiter=20000, which='LM')

# Plot terms on the first two singular dimensions.
x_dim = 0; y_dim = 1
projection = pd.DataFrame({'term':corpus.get_terms(),
                           'x':U.T[x_dim],
                           'y':U.T[y_dim]}).set_index('term')
# NOTE(review): this call is truncated in this excerpt -- the remaining
# produce_pca_explorer arguments and closing paren are not shown.
html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
github JasonKessler / scattertext / demo_pair_plot_convention_empath.py View on Github external
import scattertext as st
import scattertext.categoryprojector.pairplot

convention_df = st.SampleCorpora.ConventionData2012.get_data()
# Empath topic features are used as metadata terms instead of raw tokens.
empath_feature_builder = st.FeatsFromOnlyEmpath()

# One category per speaker; whitespace tokenization plus Empath features.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder).build().get_unigram_corpus()

# Pair plot over the Empath metadata features; selector=None is passed to the
# category projector (presumably disabling its default selection -- confirm
# against the CategoryProjector API).
html = scattertext.categoryprojector.pairplot.produce_pairplot(corpus,
                                                               use_metadata=True,
                                                               category_projector=st.CategoryProjector(selector=None),
                                                               topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
                                                               metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_empath.html'
# Fix: context manager closes the output file deterministically (the original
# leaked the handle, relying on garbage collection).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('./' + file_name)