How to use the scattertext.CorpusFromPandas.CorpusFromPandas function in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / demo_expected_vs_actual.py View on Github external
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# NOTE(review): 'en' is a spaCy shortcut-link name; newer spaCy releases
# expect a full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed version.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()

# Categories are read from the 'party' column, document text from 'text'.
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=nlp,
).build()

term_freq_df = corpus.get_term_freq_df()

def scale(ar):
    """Min-max normalise ``ar``.

    The smallest element maps to 0 and the largest to 1. ``ar`` must
    provide ``min()``/``max()`` and element-wise arithmetic (for example a
    NumPy array). When every element is equal the range is zero and the
    division is performed as-is, without any special handling.
    """
    lowest = ar.min()
    value_range = ar.max() - lowest
    return (ar - lowest) / value_range

def zero_centered_scale(ar):
    """Scale signed values into [0, 1], keeping zero at 0.5.

    Strictly positive entries are min-max scaled with ``scale`` into
    [0, 1]; strictly negative entries are scaled by magnitude into
    [-1, 0]; zeros are left alone. The result is then shifted and halved,
    so it lies in [0, 1] with an input of 0 mapping to 0.5.

    The input is never modified: the work happens on a float copy, which
    also stops fractional results from being truncated when ``ar`` has an
    integer dtype.

    :param ar: array-like supporting ``astype``, boolean-mask indexing and
        element-wise arithmetic (for example a NumPy array).
    :return: a new float array of the same shape as ``ar``.
    """
    # astype() returns a copy, so the caller's data stays untouched.
    scaled = ar.astype(float)

    positive = scaled > 0
    negative = scaled < 0

    # Skip an empty side: min()/max() inside scale() raise on empty input.
    if positive.any():
        scaled[positive] = scale(scaled[positive])
    if negative.any():
        scaled[negative] = -scale(-scaled[negative])

    return (scaled + 1) / 2.


# Sum each row of the term-frequency table across its columns, take the
# natural log of the totals, then min-max scale the result.
frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
github JasonKessler / scattertext / demo_insignificant_greyed_out.py View on Github external
import spacy

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.LogOddsUniformativePriorScore import LogOddsUninformativePriorScore

# NOTE(review): 'en' is a spaCy shortcut-link name; newer spaCy releases
# expect a full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed version.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=nlp,
).build()
term_freq_df = corpus.get_term_freq_df()

# The thresholded score is computed from the two per-category frequency
# columns and then negated.
scores = -LogOddsUninformativePriorScore.get_thresholded_score(
    term_freq_df['democrat freq'],
    term_freq_df['republican freq'],
    alpha_w=2.,
    threshold=0.1,
)
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    scores=scores,
                                    sort_by_dist=False,
                                    gray_zero_scores=True,
github JasonKessler / scattertext / simple.py View on Github external
import spacy

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext import produce_scattertext_html
from scattertext.CorpusFromPandas import CorpusFromPandas

# NOTE(review): 'en' is a spaCy shortcut-link name; newer spaCy releases
# expect a full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed version.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

# 'pmi_filter_thresold' is spelled exactly as the original call spells it;
# presumably that is the library's keyword name -- verify before "fixing" it.
html = produce_scattertext_html(corpus,
                                category='democrat',
                                category_name='Democratic',
                                not_category_name='Republican',
                                minimum_term_frequency=5,
                                pmi_filter_thresold=4,
                                width_in_pixels=1000)

# A context manager guarantees the file is flushed and closed, instead of
# relying on garbage collection of the anonymous file object.
with open('./simple.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./simple.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_general_inquirer.py View on Github external
def main():
    """Write the General Inquirer demo page to ./demo_general_inquirer.html.

    Builds a corpus from the sample convention data using
    ``FeatsFromGeneralInquirer`` as the feature extractor, renders it with
    ``produce_scattertext_explorer`` and saves the resulting HTML as UTF-8.
    """
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromGeneralInquirer()
    corpus = CorpusFromPandas(convention_df,
                              category_col='party',
                              text_col='text',
                              nlp=whitespace_nlp_with_sentences,
                              feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(
        corpus,
        category='democrat',
        category_name='Democratic',
        not_category_name='Republican',
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists(),
        metadata_descriptions=feat_builder.get_definitions(),
    )
    # Close the output file deterministically rather than leaking the handle.
    with open('./demo_general_inquirer.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
github JasonKessler / scattertext / demo_without_spacy.py View on Github external
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

# The whitespace tokenizer stands in for a spaCy pipeline here.
nlp = whitespace_nlp

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])

# A context manager guarantees the file is flushed and closed, instead of
# relying on garbage collection of the anonymous file object.
with open('./demo_without_spacy.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_without_spacy.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_hedges_r.py View on Github external
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
# A context manager guarantees the file is flushed and closed, instead of
# relying on garbage collection of the anonymous file object.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
github JasonKessler / scattertext / demo_similarity.py View on Github external
def main():
    """Write the word-similarity demo page to ./demo_similarity.html.

    Builds a corpus from the sample convention data, renders it with
    ``word_similarity_explorer`` using 'jobs' as the target term, saves the
    HTML as UTF-8 and prints where to find it.
    """
    # NOTE(review): 'en' is a spaCy shortcut-link name; newer spaCy releases
    # expect a full package name (e.g. 'en_core_web_sm') -- confirm against
    # the installed version.
    nlp = spacy.load('en')
    convention_df = SampleCorpora.ConventionData2012.get_data()
    corpus = CorpusFromPandas(convention_df,
                              category_col='party',
                              text_col='text',
                              nlp=nlp).build()
    html = word_similarity_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    target_term='jobs',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'],
                                    alpha=0.01,
                                    max_p_val=0.1,
                                    save_svg_button=True)
    # Close the output file deterministically rather than leaking the handle.
    with open('./demo_similarity.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    # The message now names the file actually written above (the original
    # text said 'demo_similarlity.html').
    print('Open ./demo_similarity.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_scaled_f_score.py View on Github external
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresetsNeg1To1

convention_df = SampleCorpora.ConventionData2012.get_data()

# Build the corpus, then reduce it with get_unigram_corpus().
corpus = (
    CorpusFromPandas(
        convention_df,
        category_col='party',
        text_col='text',
        nlp=whitespace_nlp_with_sentences,
    )
    .build()
    .get_unigram_corpus()
)

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    width_in_pixels=1000,
    term_scorer=ScaledFScorePresetsNeg1To1(beta=1, scaler_algo='normcdf'),
    grey_threshold=0,
    y_axis_values=[-1, 0, 1],
    metadata=convention_df['speaker'],
)
github JasonKessler / scattertext / demo_sparse.py View on Github external
import spacy
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, sparse_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# NOTE(review): 'en' is a spaCy shortcut-link name; newer spaCy releases
# expect a full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed version.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

# The solver is pinned explicitly: scikit-learn's current default ('lbfgs')
# rejects penalty='l1', whereas 'liblinear' (the former default) supports it.
scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l1',
                                                    C=10,
                                                    max_iter=10000,
                                                    n_jobs=-1,
                                                    solver='liblinear'))
html = sparse_explorer(corpus,
                       category='democrat',
                       scores=scores,
                       category_name='Democratic',
                       not_category_name='Republican',
                       minimum_term_frequency=5,
                       width_in_pixels=1000,
                       metadata=convention_df['speaker'])

# A context manager guarantees the file is flushed and closed, instead of
# relying on garbage collection of the anonymous file object.
with open('./demo_sparse.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_sparse.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_phrase_machine.py View on Github external
import spacy

from scattertext import SampleCorpora, PhraseMachinePhrases, dense_rank, RankDifference, AssociationCompactor, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()

# NOTE(review): 'en' is a spaCy shortcut-link name; newer spaCy releases
# expect a full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed version.
corpus = (
    CorpusFromPandas(
        convention_df,
        category_col='party',
        text_col='text',
        feats_from_spacy_doc=PhraseMachinePhrases(),
        nlp=spacy.load('en', parser=False),
    )
    .build()
    .compact(AssociationCompactor(4000))
)

# Speaker metadata is read from the corpus's own DataFrame rather than from
# convention_df.
html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    transform=dense_rank,
    metadata=corpus.get_df()['speaker'],
    term_scorer=RankDifference(),
    width_in_pixels=1000,
)