my_columns = ["choose_one", "text", "none"]
df = pd.DataFrame(arr, columns=my_columns)
df['choose_one:confidence'] = df['choose_one'].map(
    lambda x: 1 if x == "Not Relevant" or x == "Relevant" else 0.5)
elif disaster_type[ij] == "fire":
    dimensions = 350
    stem_map_high = json.load(open('./data/disasters/classify/fire_stem_map_high.json'))
    stem_map_low = json.load(open('./data/disasters/classify/fire_stem_map_low.json'))
    low_2_high_map = json.load(open('./data/disasters/classify/fire_low_2_high_map.json'))
    word2vec_flag = 1
    dictionary = corpora.Dictionary.load('./data/disasters/classify/fire_model.dict')
    tfidf = models.TfidfModel.load('./data/disasters/classify/fire_model.tfidf')
    lsi = models.LsiModel.load('./data/disasters/classify/fire_model.lsi')
    model_flag = 1
    input_file = "./data/disasters/classify/fire.csv"
arr = []
with open(input_file) as f:
    for line in f:
        a = [x.strip() for x in line.split(',')]
        arr.append(a)
# df = np.array(arr)
# df = pd.read_csv(input_file, encoding="ISO-8859-1", delimiter=",")
my_columns = ["choose_one", "text", "none"]
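# Hedged sketch (not part of the original snippet): one plausible way the
# `dictionary`, `tfidf`, and `lsi` models loaded above could be applied to the
# text read into `arr`. The whitespace tokenizer and the assumption that the
# "text" field is the second CSV column are illustrative only.
def to_lsi_vector(text):
    bow = dictionary.doc2bow(text.lower().split())  # token ids and counts from the loaded dictionary
    return lsi[tfidf[bow]]                          # tf-idf weighting, then projection into LSI space

fire_vectors = [to_lsi_vector(row[1]) for row in arr if len(row) > 1]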
parser.add_argument('-t', '--topology_file', required=True, action='store', dest='top_file', help='Location of topology file')
parser.add_argument('-p', '--dir_prefix', choices=['clique', 'community'], required=True, action='store', dest='dir_prefix', help='Select whether the topology contains cliques or communities')
parser.add_argument('-w', '--working_dir', required=True, action='store', dest='working_dir', help='Name of the directory you want to direct output to')
parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of the saved LDA model')
parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary for the model')
parser.add_argument('-u', '--unseen_docs', required=True, action='store', dest='unseen_docs', help='Directory containing unseen documents')
parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
argcomplete.autocomplete(parser)
args = parser.parse_args()
output_dir = os.path.join(args.working_dir, '')
if not os.path.exists(os.path.dirname(output_dir)):
    os.makedirs(os.path.dirname(output_dir), 0o755)
# load dictionary
model_dict = corpora.Dictionary.load(args.dict_loc)
# load trained model from file
lda = models.LdaModel.load(args.lda_loc)
write_topn_words(output_dir, lda)
# create a set of all users from topology file
with open(args.top_file, 'r') as inp_file:
    users = set(str(user) for community in inp_file for user in ast.literal_eval(community))
# resume a 'job in progress' if this program was previously run and stopped
try:
    with open(output_dir + 'document_vectors.json', 'r') as all_community_file:
        document_vectors = json.load(all_community_file)
except (IOError, ValueError):  # no saved progress yet, or the file is unreadable: start fresh
    document_vectors = {}
# use multiprocessing to query document vectors
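# Hedged sketch (not from the original source): one way the multiprocessing
# step announced above might query document vectors with the loaded `lda`
# model and `model_dict`. It assumes one plain-text document per user under
# args.unseen_docs; the whitespace tokenization is also an assumption.
import multiprocessing

def doc_vector(user):
    path = os.path.join(args.unseen_docs, user)
    if not os.path.exists(path):
        return user, []
    with open(path) as doc:
        bow = model_dict.doc2bow(doc.read().split())
    topics = lda.get_document_topics(bow, minimum_probability=0)
    return user, [float(weight) for _, weight in topics]

pool = multiprocessing.Pool()
remaining = [user for user in users if user not in document_vectors]
for user, vec in pool.imap_unordered(doc_vector, remaining):
    document_vectors[user] = vec
pool.close()
pool.join()

with open(output_dir + 'document_vectors.json', 'w') as out_file:
    json.dump(document_vectors, out_file)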
# this is for corpus data
ZHIHU_ITEM_PATH = './zhihu_dat/item.dat'
ZHIHU_USER_PATH = './zhihu_dat/users.dat'
# this is for adjacency data
ZHIHU_ITEM_ADJ = './zhihu_dat/item_adj.dat'
ZHIHU_USER_ADJ = './zhihu_dat/user_adj.dat'
# this is for truth data
ZHIHU_TRUTH_ADJ = './zhihu_dat/truth.dat'
# this is for the user profile
ZHIHU_USER_Q_NUMBER = './zhihu_dat/user_q_num.dat'
ZHIHU_USER_Q_SCORE = './zhihu_dat/user_q_score.dat'
dictionary = corpora.Dictionary.load(ZHIHU_DICT_PATH)
# formatted user topic data
ZHIHU_USER_TOPIC_PATH = './zhihu_dat/zhihu_user_topic.dat'
We assume the vocabulary is stored one item per line, so a file:
    dog
    cat
will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
also return the reversed-vocabulary ["dog", "cat"].
Args:
    vocabulary_path: path to the file containing the vocabulary.
Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).
Raises:
    ValueError: if the provided vocabulary_path does not exist.
"""
if gfile.Exists(vocabulary_path):
    vocab = corpora.Dictionary.load(vocabulary_path)
    # tokens ordered by id, so rev_vocab[token_id] == token as the docstring promises
    rev_vocab = sorted(vocab.token2id, key=vocab.token2id.get)
    return vocab.token2id, rev_vocab
else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)
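# Hedged usage sketch: the enclosing function's name is cut off above, so
# `initialize_vocabulary` below is a hypothetical name used only to illustrate
# the documented return values.
#
#   vocab, rev_vocab = initialize_vocabulary('./data/model.dict')
#   dog_id = vocab['dog']        # integer id assigned to "dog"
#   token = rev_vocab[dog_id]    # maps the id back to "dog"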
def load_dictionaries(self, topicDict=None, opinionDict=None):
    if topicDict:
        self.topicDictionary = corpora.Dictionary.load(topicDict)
        logger.info('topic dictionary {}'.format(self.topicDictionary))
    if opinionDict:
        self.opinionDictionary = corpora.Dictionary.load(opinionDict)
        logger.info('opinion dictionary {}'.format(self.opinionDictionary))
@classmethod
def init_from_files(cls, topic_model_fname, gensim_dict_fname, lr_dict_fname,
                    *args, **kwargs):
    topic_modeler = models.ldamodel.LdaModel.load(topic_model_fname)
    gensim_dict = corpora.Dictionary.load(gensim_dict_fname)
    lr_dict = joblib.load(lr_dict_fname)
    return cls(topic_modeler, gensim_dict, lr_dict, *args, **kwargs)
#!/usr/bin/env python
import logging
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities, matutils
from stoplist import stoplist
from vstore import VStore
import digestion
SOURCE = "wiki3"
### Create the corpus out of the documents
if os.path.exists(SOURCE + '.corpus.mm'):
    # Query mode
    unidict = corpora.Dictionary.load(SOURCE + ".dict")
    unilsi = models.LsiModel.load(SOURCE + '.unilsimodel')
    uniindex = similarities.MatrixSimilarity.load(SOURCE + ".matsim")
else:
    # Index mode
    # collect statistics about all tokens
    unidict = digestion.line_dict(SOURCE)
    filter_ids = set(unidict.token2id[stopword] for stopword in stoplist
                     if stopword in unidict.token2id)  # stopwords
    filter_ids.update(set([unidict.token2id[fragment] for fragment in unidict.token2id
                           if len(fragment) == 1]))  # short words
    filter_ids.update(set([tokenid for tokenid, docfreq in unidict.dfs.items()
                           if docfreq == 1]))  # hapax legomena
    unidict.filter_tokens(filter_ids)  # remove stop words and words that appear only once
    unidict.compactify()  # remove gaps in id sequence after words that were removed
    unidict.save(SOURCE + '.dict')
    print(unidict)
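# Hedged sketch (not part of the original file): how the query-mode objects
# loaded above (`unidict`, `unilsi`, `uniindex`) are typically used in gensim
# to rank indexed documents against a free-text query; the query string and
# tokenization are assumptions for illustration.
query = "example query text"
query_bow = unidict.doc2bow(query.lower().split())
query_lsi = unilsi[query_bow]                      # project the query into LSI space
sims = sorted(enumerate(uniindex[query_lsi]), key=lambda item: -item[1])
print(sims[:10])                                   # ten most similar documents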
def __init__(self, oblige_fit, path):
    super().__init__(oblige_fit, path)
    with open(path + 'tags_embs.pkl', 'rb') as file:
        self.embs = pickle.load(file)
    self.tp = TextProcessor(path)
    self.lda_dic = Dictionary.load(path + 'questions.lda_dic')
    self.lda_tfidf = TfidfModel.load(path + 'questions.lda_tfidf')
    self.lda_model = LdaMulticore.load(path + 'questions.lda_model')
    self.d2v = Doc2Vec.load(path + 'questions.d2v')
    self.features = {
        'categorical': [],
        'numerical': {
            'zero': ['questions_body_length', 'questions_tag_count'],
            'mean': []
        },
        'date': ['questions_date_added']
    }
    self._unroll_features()
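# Hedged sketch (not in the original class): a hypothetical helper showing how
# the models loaded in __init__ might embed a single question body; the
# whitespace tokenization below is an assumption for illustration.
def _embed_question(self, question_body):
    tokens = question_body.lower().split()
    bow = self.lda_dic.doc2bow(tokens)
    topic_vec = self.lda_model[self.lda_tfidf[bow]]  # sparse LDA topic distribution
    dense_vec = self.d2v.infer_vector(tokens)        # Doc2Vec paragraph vector
    return topic_vec, dense_vec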
# add tokens to corpus list
texts.append(preprocess(raw))
sys.stdout.write('\rCreating a list of tokenized documents: %d/%d documents processed...' % (len(texts),len(train_dict.values())))
sys.stdout.flush()
sys.stdout.write(' Done!\n')
# turn our tokenized documents into an id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into an id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
print ' Done!'
# ignore words that appear in fewer than 20 documents or in more than 50% of documents
dictionary.filter_extremes(no_below=20, no_above=0.5)
# convert tokenized documents into a document-term matrix
if not os.path.isfile('./bow.mm'):
    print 'Convert tokenized documents into a document-term matrix ...',
    sys.stdout.flush()
    corpus = [dictionary.doc2bow(text) for text in texts]
    gensim.corpora.MmCorpus.serialize('./bow.mm', corpus)
else:
    print 'Loading document-term matrix from ./bow.mm ...',
    sys.stdout.flush()
    corpus = gensim.corpora.MmCorpus('./bow.mm')
print ' Done!'
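# Hedged sketch (not part of the original snippet): a typical next step is to
# fit or reload an LDA model on the document-term matrix built above; the
# topic count, pass count, and file name below are illustrative assumptions.
if not os.path.isfile('./lda.model'):
    lda_model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=50, passes=10)
    lda_model.save('./lda.model')
else:
    lda_model = gensim.models.LdaModel.load('./lda.model')
print(lda_model.show_topics(num_topics=5))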