# Prepare paths and parameters
if isinstance(message['Metadata'], dict):  # support for cellprofiler --print-groups output
    if message['output_structure'] == '':
        watchtowerlogger = watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=str(message['Metadata'].values()), create_log_group=False)
        logger.addHandler(watchtowerlogger)
        printandlog('You must specify an output structure when passing Metadata as dictionaries', logger)
        logger.removeHandler(watchtowerlogger)
        return 'INPUT_PROBLEM'
    else:
        metadataID = message['output_structure']
        metadataForCall = ''
        for eachMetadata in message['Metadata'].keys():
            if eachMetadata not in metadataID:
                watchtowerlogger = watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=str(message['Metadata'].values()), create_log_group=False)
                logger.addHandler(watchtowerlogger)
                printandlog('Your specified output structure does not match the Metadata passed', logger)
                logger.removeHandler(watchtowerlogger)
                return 'INPUT_PROBLEM'
            else:
                # str.replace, not the Python 2 string.replace helper removed in Python 3
                metadataID = metadataID.replace(eachMetadata, message['Metadata'][eachMetadata])
                metadataForCall += eachMetadata + '=' + message['Metadata'][eachMetadata] + ','
        message['Metadata'] = metadataForCall[:-1]
elif 'output_structure' in message.keys():
    if message['output_structure'] != '':  # support for explicit output structuring
        watchtowerlogger = watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=message['Metadata'], create_log_group=False)
        logger.addHandler(watchtowerlogger)
        metadataID = message['output_structure']
        for eachMetadata in message['Metadata'].split(','):
            if eachMetadata.split('=')[0] not in metadataID:
                printandlog('Your specified output structure does not match the Metadata passed', logger)
                logger.removeHandler(watchtowerlogger)
                return 'INPUT_PROBLEM'
            else:
                metadataID = metadataID.replace(eachMetadata.split('=')[0], eachMetadata.split('=')[1])
        printandlog('metadataID =' + metadataID, logger)
        logger.removeHandler(watchtowerlogger)
    else:  # backwards compatibility with 1.0.0 and/or no desire to structure output
        metadataID = '-'.join([x.split('=')[1] for x in message['Metadata'].split(',')])  # keep only the metadata values
else:  # backwards compatibility with 1.0.0 and/or no desire to structure output
    metadataID = '-'.join([x.split('=')[1] for x in message['Metadata'].split(',')])  # keep only the metadata values
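# Illustration (hypothetical values): Metadata 'Plate=P1,Well=A01' with
# output_structure 'Plate-Well' expands to metadataID 'P1-A01'; with no
# output_structure, the fallback join above also gives 'P1-A01'.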
localOut = LOCAL_OUTPUT + '/%(MetadataID)s' % {'MetadataID': metadataID}
remoteOut= os.path.join(message['output'],metadataID)
replaceValues = {'PL':message['pipeline'], 'OUT':localOut, 'FL':message['data_file'],
help="Path to projects signup questionnaire")
parser.add_argument("--auth", default="GITHUB_API")
parser.add_argument("--outdir", "-o", default="build")
parser.add_argument("--per_page", "-n", default=100)
parser.add_argument("--max_pages", "-m", default=100)
parser.add_argument("--since", "-s", default="2017-01-01",
help="Date from which to search, YYYY-MM-DD")
args = parser.parse_args()
# Generate the github API user:token pair
# auth_user = args.auth_user
# auth_token = get_API_token(args.auth_token)
# auth = ':'.join([auth_user, auth_token])
# XXX should fix this to have choldgraf's technique work on Travis
auth = get_API_token(args.auth)
per_page = args.per_page
max_pages = args.max_pages
since = args.since
def write(db, hits, verbosity=0):
    i = len(hits)
    hit_objs = []
    for hit in hits:
        if verbosity > 0 and i > 0:
            if verbosity > 2:
                print(json.dumps(hit, indent=2))
        hit_obj = Hit(
            path=hit["path"],
            method=hit["method"],
            ip=hit["ip"],
            user_agent=hit["user_agent"],
            authenticated=convertBool(hit["is_authenticated"]),
            staff=convertBool(hit["is_staff"]),
            superuser=convertBool(hit["is_superuser"]),
            username=hit["user"],
            referer=hit["referer"],
            view=hit["view"],
            module=hit["module"],
            status_code=int(hit["status_code"]),
            reason_phrase=hit["reason_phrase"],
            request_time=float(hit["request_time"]),
            doc_size=hit["doc_size"],
            num_queries=int(hit["num_queries"]),
        )
        hit_objs.append(hit_obj)
# Clean up
all_dates = all_dates.drop(0, axis=1)
all_dates = all_dates.replace(np.nan, 0)
all_dates = all_dates.astype(int)
return all_dates
# --- Run the script ---
try:
    os.makedirs("build/images")
except OSError:
    pass
meta = pd.read_csv('.project_info.csv')
db = GithubDatabase()
projects = [ii.split('/')[-2:] for ii in db.projects]
groupby = 'weekday'
start = '2017-03-01'
stop = '2017-03-13'
exceptions = []
all_dates = []
for user, project in tqdm(projects):
    try:
        this_meta = meta.query('github_org == "{}/{}"'.format(user, project))
        words = this_meta['words'].values[0]
        words = None if isinstance(words, float) else words
        words = '' if words is None else words
        branch = this_meta['branch'].values[0]
        branch = None if isinstance(branch, float) else branch
projects = projects[list(rename.values())]
projects['url'] = projects['url'].apply(validate_url)
def is_doc(row):
    doc_words = ['doc', 'documentation', 'docathon']
    is_doc = 0
    for word in doc_words:
        if word in row['title']:
            is_doc += 1
        if word in row['label_names']:
            is_doc += 1
    return is_doc > 0
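# e.g. (hypothetical row) is_doc({'title': 'Improve docs', 'label_names': ''})
# returns True, since 'doc' occurs in the title.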
projects['doc_issues'] = None
db = GithubDatabase()
for ix, project in tqdm(projects.iterrows()):
    if not isinstance(project['github_org'], str):
        continue
    org, repo = project['github_org'].split('/')[-2:]
    proj = db.load(org, repo)
    if proj.issues is None:
        continue
    issues = proj.issues.query('state == "open"')
    issues = issues[pd.isnull(issues['pull_request'])]
    if len(issues) == 0:
        print('{}: No open issues w/o a PR'.format(repo))
        continue
    issues['is_doc'] = issues.apply(is_doc, axis=1)
    doc_issues = [{'url': issue['html_url'],
                   'title': issue['title']}
                  for ix, issue in issues.iterrows()
                  if issue['is_doc']]
changes = []
for ifile in resp['files']:
    ifilename = ifile['filename'].lower()
    found = 0
    for word in search_words:
        if word in ifilename and not any(bword in ifilename for bword in DOC_CHEATING_WORDS):
            found += 1
    found = found > 0
    changes.append({'filename': ifile['filename'],
                    'changes': ifile['changes'],
                    'additions': ifile['additions'],
                    'deletions': ifile['deletions'],
                    'found': found})
return pd.DataFrame(changes)
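# Each row of the returned frame describes one entry in resp['files']: its
# name, change counts, and whether a doc-related word (and no "cheating"
# word) matched the filename.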
db = GithubDatabase()
meta = pd.read_csv('.project_info.csv')
all_diffs = []
for proj in db.projects:
    print(proj)
    user, project = proj.split('/')
    this_meta = meta.query('github_org == "{}/{}"'.format(user, project))
    branch = this_meta['branch'].values[0]
    branch = None if isinstance(branch, float) else branch
    diffs = find_commit_diffs(user, project, branch=branch)
    if diffs is None or len(diffs) == 0:
        continue
    diffs['project'] = project
    diffs['user'] = user
    all_diffs.append(diffs)
# Load data from google drive questionnaire
projects = pd.read_csv(args.filename)
# Pull user/project info
columns = ['Github organization and project (if applicable)', 'branch']
projects = projects[columns].values
# Remove missing projects
projects = [ii.split('/') + [br] for ii, br in projects if isinstance(ii, str)]
# Remove non GH projects
projects = [ii for ii in projects if len(ii) == 3]
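# e.g. (hypothetical entry) 'myorg/myrepo' with branch 'master' becomes
# ['myorg', 'myrepo', 'master']; entries without exactly one '/' are dropped.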
# Iterate over the projects and retrieve their latest info
print('Updating commits for %s projects' % len(projects))
exceptions = []
for user, project, branch in projects:
    try:
        branch = None if isinstance(branch, float) else branch
        commits_.update_commits(user, project, auth,
                                since=since, branch=branch)
        issues_.update_issues(user, project, auth,
                              since=since, state='all')
    except Exception:
        exceptions.append(project)
print('Finished updating commits.\nFailed for: {}'.format(exceptions))
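# Decode one SEPARATOR-delimited hit record (bytes) back into a dict of the
# fields consumed by write() above; field order must match the encoder.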
def decodeHitRow(row):
    global G
    vals = row.decode().split(SEPARATOR)
    data = {}
    data["site"] = vals[0]
    data["path"] = vals[1]
    data["method"] = vals[2]
    data["ip"] = vals[3]
    data["user_agent"] = vals[4]
    data["is_authenticated"] = vals[5]
    data["is_staff"] = vals[6]
    data["is_superuser"] = vals[7]
    data["user"] = vals[8]
    data["referer"] = vals[9]
    data["view"] = vals[10]
    data["module"] = vals[11]
    data["status_code"] = int(vals[12])
    data["reason_phrase"] = vals[13]
    data["request_time"] = int(vals[14])
    # Assumed continuation: doc_size and num_queries follow in the same
    # serialized order in which write() consumes them.
    data["doc_size"] = vals[15]
    data["num_queries"] = vals[16]
    return data
def decodeEventRow(row):
    row = row.decode("utf-8")
    event = {}
    for el in row.split(SEPARATOR):
        t = el.split(":;")
        event[t[0]] = t[1]
    return event
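# Minimal usage sketch (hypothetical SEPARATOR value of "\t"):
#   decodeEventRow(b"user:;alice\taction:;login")
#   -> {"user": "alice", "action": "login"}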