# Python 2 script: counts how many collected tweets share each domain and
# exports the tally to shared_domains.csv.
import json
import sys
from collections import defaultdict

import progressbar
from pymongo import MongoClient
# normalize_url is assumed to come from the ural library (as in the snippets
# below); format_csv and isodate are project helpers (CSV escaping and tweet
# date parsing) assumed to be defined elsewhere.
from ural import normalize_url

with open('config.json') as confile:
    conf = json.loads(confile.read())
db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
#query["langs"] = "fr"

print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        d = normalize_url(l.split("/")[2])
        urls[d] += 1

print "Sorting and storing csv data..."
with open("shared_domains.csv", "w") as f:
    print >> f, "domain,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)
with open("users_urls_domains.csv", "w") as f:
print >> f, "user_screenname,user_id,url,normalized_url,domain,datetime,is_retweet,followers,has_media"
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"user_screen_name": 1, "user_id_str": 1, "links": 1, "proper_links": 1, "retweet_id": 1, "created_at": 1, "user_followers": 1, "medias": 1})):
links = t.get("proper_links", t["links"])
if not links:
continue
name = t.get("user_screen_name")
uid = t.get("user_id_str")
isRT = 1 if t["retweet_id"] else 0
fols = t["user_followers"]
media = 1 if t["medias"] else 0
dtime = isodate(t["created_at"])
for l in links:
try:
lnk = normalize_url(l.encode("utf-8").replace("%0D", ""), strip_trailing_slash=True, strip_lang_subdomains=True)
except Exception as e:
print >> sys.stderr, "ERROR normalizing url", l, type(e), e
lnk = l
try:
domain = normalize_url(l.split("/")[2])
except Exception as e:
print >> sys.stderr, "ERROR normalizing domain for url", l, type(e), e
domain = ""
print >> f, ",".join([format_csv(v) for v in [name, uid, l, lnk, domain, dtime, str(isRT), str(fols), str(media)]])
# Python 3 script: reads a CSV of urls and tallies their fragments and
# query-string parameters to spot candidates for normalization.
from urllib.parse import urlsplit, parse_qsl
from collections import Counter

from tqdm import tqdm
from ural import normalize_url

TOP = 50

FRAGMENTS = Counter()
QUERIES = Counter()
QUERIES_COMBO = Counter()

with open('./scripts/data/urls.csv') as f:
    for line in tqdm(f, desc='Reading urls'):
        url = line.strip()[1:-1]
        url = normalize_url(url, strip_protocol=False)
        parsed = urlsplit(url)

        FRAGMENTS[parsed.fragment] += 1

        if parsed.query:
            for name, value in parse_qsl(parsed.query):
                QUERIES[name] += 1
                QUERIES_COMBO['%s=%s' % (name, value)] += 1


def report(name, counter):
    print()
    title = 'Top %i %s:' % (TOP, name)
    print(title)
    print('-' * len(title))
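    # Assumed continuation: print the TOP most frequent values of the counter.
    for key, count in counter.most_common(TOP):
        print('%s\t%i' % (key, count))


# Hypothetical usage of report() on the three counters built above.
report('fragments', FRAGMENTS)
report('query parameters', QUERIES)
report('query parameter combinations', QUERIES_COMBO)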
# CSV CLI action: appends a "<column>_normalized" column to the input file.
import csv

from ural import normalize_url  # assumed source of normalize_url, as above


def normalize_action(namespace):
    sort_query = not namespace.no_query_sort
    strip_authentication = not namespace.keep_authentication
    strip_trailing_slash = namespace.strip_trailing_slash
    strip_index = not namespace.keep_index

    # custom_reader is a project helper (not shown here): it returns the CSV
    # headers, the index of the requested column and an iterator over the rows.
    headers, position, reader = custom_reader(namespace.file, namespace.column)

    headers.append(namespace.column + "_normalized")

    writer = csv.writer(namespace.output)
    writer.writerow(headers)

    for line in reader:
        url = line[position]
        line.append(normalize_url(url, sort_query=sort_query, strip_authentication=strip_authentication,
                                  strip_trailing_slash=strip_trailing_slash, strip_index=strip_index))
        writer.writerow(line)
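# Hypothetical wiring for normalize_action(): the CLI building the namespace is
# not shown, so the argparse flag names and defaults below are assumptions
# mirroring the attributes read above, for illustration only.
import argparse
import sys

parser = argparse.ArgumentParser(description='Append a <column>_normalized column to a CSV of urls.')
parser.add_argument('file', type=argparse.FileType('r'), help='input CSV file')
parser.add_argument('column', help='name of the column holding the urls')
parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help='output CSV file (defaults to stdout)')
parser.add_argument('--no-query-sort', action='store_true', help='do not sort query parameters')
parser.add_argument('--keep-authentication', action='store_true', help='keep user:password@ credentials in urls')
parser.add_argument('--strip-trailing-slash', action='store_true', help='drop trailing slashes')
parser.add_argument('--keep-index', action='store_true', help='keep index.html and similar suffixes')

if __name__ == '__main__':
    normalize_action(parser.parse_args())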