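# --- Pairplot of the Rotten Tomatoes sample corpus, one category per movie ---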
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Rename the raw categories to human-readable labels.
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()
html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # These relative paths serve D3 locally and assume the script runs from a
    # scattertext source checkout; omit d3_url_struct to use the default URLs.
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'
    )
)
file_name = 'movie_pair_plot.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
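# --- Term projection explorer: plot terms in a truncated-SVD space ---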
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .select(ClassPercentageCompactor(term_count=3)))
html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      # The original snippet was truncated here; the closing
                                      # argument and file output are a plausible reconstruction.
                                      metadata=convention_df['speaker'])
file_name = 'demo_projection_svd.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
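# --- Frequency explorer scored with the Hedges' r term scorer ---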
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % file_name)
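# --- Semiotic square: fresh vs. rotten reviews, plot descriptions as the neutral category ---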
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# movie_df.category = movie_df.category.apply(
#     lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build()
corpus = corpus.get_unigram_corpus()
semiotic_square = st.SemioticSquare(
    corpus,
    category_a='fresh',
    category_b='rotten',
    neutral_categories=['plot'],
    scorer=st.RankDifference(),
    labels={'not_a_and_not_b': 'Plot Descriptions',
            'a_and_b': 'Reviews',
            'a_and_not_b': 'Positive',
            'b_and_not_a': 'Negative',
            'a': '',
            'b': '',
            'not_a': '',
            # The original snippet was truncated here; the final label and the
            # explorer call below are a plausible reconstruction of the demo.
            'not_b': ''}
)
html = st.produce_semiotic_square_explorer(semiotic_square,
                                           category_name='Fresh',
                                           not_category_name='Rotten',
                                           x_label='Fresh-Rotten',
                                           y_label='Plot-Review',
                                           metadata=movie_df['movie_name'])
file_name = 'movie_semiotic.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
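# --- ICLR 2018 reviews: building a compacted corpus and term priors ---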
import time

import pandas as pd

import scattertext as st

t0 = time.time()
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(st.whitespace_nlp_with_sentences)
full_corpus = (st.CorpusFromParsedDocuments(reviews_df,
                                            category_col='category',
                                            parsed_col='parse',
                                            # feats_from_spacy_doc=st.PhraseMachinePhrases()
                                            ).build())
term_ranker = st.OncePerDocFrequencyRanker
corpus = (full_corpus
          .keep_only_these_categories(['Accept, Positive', 'Accept, Negative',
                                       'Reject, Positive', 'Reject, Negative'],
                                      False)
          .get_unigram_corpus()
          .select(st.ClassPercentageCompactor(term_count=5)))
print('finding priors', time.time() - t0, 's')
# The original snippet was truncated after PriorFactory was opened; the
# chained calls below are a plausible reconstruction.
priors = (st.PriorFactory(full_corpus, starting_count=0.01)
          .use_all_categories()
          .get_priors())
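# --- Frequency explorer scored with the CredTFIDF term scorer ---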
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus().remove_categories(['plot'])
term_scorer = st.CredTFIDF(corpus).set_categories('fresh', ['rotten'])
print(term_scorer.get_score_df().sort_values(by='delta_cred_tf_idf', ascending=False).head())
html = st.produce_frequency_explorer(
    corpus,
    category='fresh',
    not_category_name='rotten',
    term_scorer=term_scorer,
    metadata=corpus.get_df()['movie_name'],
    grey_threshold=0
)
file_name = 'demo_cred_tfidf.html'
open(file_name, 'wb').write(html.encode('utf-8'))
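# --- Characteristic-term explorer contrasting bot and normal comments ---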
import numpy as np
import pandas as pd

import scattertext as st

# X_comments (comment texts) and Y_comments (0 = normal, 1 = bot labels) are
# assumed to be defined earlier; the original snippet begins mid-script.
data = np.empty([len(X_comments), 2], dtype=object)
data[:, 0] = Y_comments
data[:, 1] = X_comments
# Replace the numeric labels with readable category names.
for d in data:
    if d[0] == 0:
        d[0] = 'normal'
    else:
        d[0] = 'bot'
df = pd.DataFrame({'label': data[:, 0], 'text': data[:, 1]})
print(df)
corpus = (st.CorpusFromPandas(df, category_col='label', text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus()
          .compact(st.ClassPercentageCompactor(term_count=2,
                                               term_ranker=st.OncePerDocFrequencyRanker)))
html = st.produce_characteristic_explorer(
    corpus,
    category='normal',
    category_name='Normal',
    not_category_name='Bot'
)
open('comment_text_chart.html', 'wb').write(html.encode('utf-8'))
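# --- Fightin' Words explorer with a log-odds-ratio informative Dirichlet prior ---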
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())
# Build informative priors from general English term frequencies plus
# counts from every category in the corpus.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          .use_general_term_frequencies()
          .use_all_categories()
          .get_priors())
(open(fn, 'wb')
 .write(
     st.produce_fightin_words_explorer(
         corpus,
         category='fresh',
         not_categories=['rotten'],
         metadata=df['movie_name'],
         term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
         # The original snippet was truncated here; closing the call and
         # encoding the HTML is a plausible reconstruction.
     ).encode('utf-8')))
print('./' + fn)
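# --- PCA-style term plot: tf-idf term embeddings decomposed with sparse SVD ---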
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .remove_infrequent_words(minimum_term_count=3,
                                   term_ranker=st.OncePerDocFrequencyRanker))
# Tf-idf-weight the term-document matrix, then take the top singular vectors
# of its transpose so each term gets a 2-d coordinate.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')
x_dim = 0
y_dim = 1
projection = pd.DataFrame({'term': corpus.get_terms(),
                           'x': U.T[x_dim],
                           'y': U.T[y_dim]}).set_index('term')
html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
                               # The original snippet was truncated here; the remaining
                               # arguments and file output are a plausible reconstruction.
                               not_category_name='Republican',
                               projection=projection,
                               metadata=convention_df['speaker'],
                               x_dim=x_dim,
                               y_dim=y_dim)
file_name = 'demo_pca.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
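# --- Pairplot of convention speakers using Empath topic features ---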
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder).build().get_unigram_corpus()
html = st.produce_pairplot(corpus,
                           use_metadata=True,
                           category_projector=st.CategoryProjector(selector=None),
                           topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
                           metadata=convention_df['party'] + ': ' + convention_df['speaker'])
file_name = 'convention_pair_plot_empath.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)