import csv

def gcp_file_reader(fn):
    """Yield records from a tab-separated file, skipping rows whose id starts with '#'."""
    with open(fn, 'r') as f:
        rdr = csv.DictReader(f, delimiter='\t')
        for rec in rdr:
            if rec['id'].startswith('#'):
                continue
            yield rec
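# --- Usage sketch (not from the original source): the filename is
# hypothetical; any tab-separated file with an 'id' column works.
for rec in gcp_file_reader('records.tsv'):
    print(rec['id'])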
try:
    code_set = self.filesystem.read_yaml(
        self.config.build.codes_file.format(table_name))
except IOError:
    # For the first run, when the field analysis hasn't yet been done.
    from collections import defaultdict
    datatypes = defaultdict(lambda: {'type': 'varchar'})
    code_set = defaultdict(lambda: [])

with self.session as s:
    table = self.schema.add_table('api_{}'.format(group))

    with open(self.filesystem.path(
            'meta', 'most_recent_fields_{}.csv'.format(group))) as f:
        reader = csv.DictReader(f)
        for row in reader:
            pk = row['name'] == 'id'
            datatype = datatypes[row['name']]['type']
            c = self.schema.add_column(
                table, row['name'],
                description=row['description'],
                datatype='integer' if pk else datatype,
                is_primary_key=pk,
                data={'codes': ','.join(code_set[row['name']])
                      if row['name'] in code_set else None})
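# --- Sketch (not from the original source): the defaultdict fallbacks above
# give the first run usable defaults instead of raising KeyError.
from collections import defaultdict

datatypes = defaultdict(lambda: {'type': 'varchar'})
code_set = defaultdict(lambda: [])

print(datatypes['any_column']['type'])  # 'varchar'
print(code_set['any_column'])           # []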
rows = []
with b.source_fs.open('source_schema.csv', encoding='utf8') as f:
    r = csv.reader(f)
    headers = next(r)
    for row in r:
        d = dict(zip(headers, row))
        d['dest_header'] = 'X' + d['source_header']
        rows.append(d)

# Fails with: TypeError: must be unicode, not str
# with b.source_fs.open('source_schema.csv', 'w', encoding='utf8') as f:
path = b.source_fs.getsyspath('source_schema.csv')
with open(path, 'w') as f:
    w = csv.DictWriter(f, fieldnames=headers)
    w.writeheader()
    for row in rows:
        w.writerow(row)

b.sync_in()
self.assertEqual([u'int', u'float', u'string', u'time', u'date'],
                 [c.source_header for c in b.dataset.source_table('types1').columns])

b.clean_ingested()
b.ingest(tables=['types'])
self.assertEqual([u'int', u'float', u'string', u'time', u'date'],
                 [c.source_header for c in b.dataset.source_table('types1').columns])
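# --- Sketch (an assumption, not the original fix): under Python 2 the stdlib
# csv module emits byte strings, which is why the text-mode handle above
# raised "TypeError: must be unicode, not str". The unicodecsv package writes
# through a byte-oriented file instead, assuming source_fs accepts mode 'wb'
# as pyfilesystem does.
import unicodecsv

with b.source_fs.open('source_schema.csv', 'wb') as f:
    w = unicodecsv.DictWriter(f, fieldnames=headers, encoding='utf-8')
    w.writeheader()
    for row in rows:
        w.writerow(row)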
conn.execute("""
INSERT INTO {} VALUES (
'{}', '{}'
)
""".format(mysql_table, *db_record))
from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
import unicodecsv as csv
op = MySqlToHiveTransfer(
task_id='test_m2h',
hive_cli_conn_id='hive_cli_default',
sql="SELECT * FROM {}".format(mysql_table),
hive_table=hive_table,
recreate=True,
delimiter=",",
quoting=csv.QUOTE_NONE,
quotechar='',
escapechar='@',
dag=self.dag)
op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
from airflow.hooks.hive_hooks import HiveServer2Hook
hive_hook = HiveServer2Hook()
result = hive_hook.get_records("SELECT * FROM {}".format(hive_table))
self.assertEqual(result[0], db_record)
finally:
with hook.get_conn() as conn:
conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
        verbosity, no_comments,
        comments, **dummy):   # earlier parameters are truncated in this excerpt
    try:
        # Read and skip comments at start of CSV file.
        csvcomments = ''
        if infile:
            csvFile = open(infile, 'r')   # Python 2's file() replaced with open()
            while True:
                line = csvFile.readline()
                if line[:1] == '#':
                    csvcomments += line
                else:
                    csvfieldnames = next(unicodecsv.reader([line]))
                    break

        if not no_comments:
            logfilename = outfile.rsplit('.', 1)[0] + '.log'
            if os.path.isfile(logfilename):
                incomments = open(logfilename, 'r').read()
            else:
                incomments = ''
            logfile = open(logfilename, 'w')
            logfile.write(comments)
            logfile.write(csvcomments)
            logfile.write(incomments)
            logfile.close()

        norm = NVivoNorm(outfile)
        norm.begin()

        if tagging:
            # Build evaltagging from the user-supplied expression; the exec'd
            # string needs the indented body the original escaped version lost.
            exec("def evaltagging(sourceRow, csvRow):\n"
                 "    return " + tagging, globals())
import copy
import unicodecsv as csv

def load_gaz(gaz_fn):
    template = {'GPE': [], 'LOC': [], 'ORG': [], 'PER': []}
    # deepcopy rather than copy: a shallow copy would share the inner
    # lists across all six languages.
    gaz = {
        'amh': copy.deepcopy(template),
        'eng': copy.deepcopy(template),
        'deu': copy.deepcopy(template),
        'orm': copy.deepcopy(template),
        'som': copy.deepcopy(template),
        'tir': copy.deepcopy(template),
    }
    with open(gaz_fn, 'rb') as f:
        reader = csv.reader(f, encoding='utf-8')
        next(reader)  # skip the header row
        for fields in reader:
            eng, lab, tir, tir_ipa, orm, orm_ipa, wik, id_, _ = fields
            if not lab and len(eng.split()) == 1:
                lab = 'GPE'
            if tir and lab:
                for v in get_variants(tir):
                    gaz['tir'][lab].append(v)
            if orm and lab:
                for v in get_variants(orm):
                    gaz['orm'][lab].append(v)
    return gaz
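# --- Usage sketch (hypothetical path): the gazetteer CSV needs the nine
# columns unpacked in load_gaz above.
gaz = load_gaz('gazetteer.csv')
print(len(gaz['tir']['GPE']))  # Tigrinya GPE variants loaded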
    17: 'grandchild',
    18: 'childinlaw',
    19: 'student',
    20: 'member',
    21: 'correspondent',
    22: 'opposed',
    23: 'cousin',
}

tsv.seek(0)
# Skip the four header lines before the data rows.
for _ in range(4):
    next(tsv)

print("Adding relationships")
for l in csv.reader(tsv, dialect="excel-tab"):
    key = l[0].encode('ascii', errors='ignore')
    p = Person().load({"key": key})
    for i, type in rowmap.items():
        if l[i]:
            for pkey in l[i].split(","):
                pkey = pkey.strip().encode('ascii', errors='ignore')
                print("{} - {}".format(key, pkey))
                if Person().load({"key": pkey}):
                    pr = PersonRelationship({
                        "type": type,
                        "from_key": key,
                        "to_key": pkey
                    })
                    pr.save()
def doAttrInserts(csv_file, db):
    inserts = defaultdict(list)
    insertNum = 0
    with open(csv_file) as csvFile:
        reader = unicodecsv.DictReader(csvFile, encoding='utf-8')
        for line in reader:
            for field_name in line:
                if field_name == 'id' or field_name == 'name':
                    continue
                attr_type = available_attrs[str(field_name)][1]
                inserts[attr_type].append({'attr_name': field_name,
                                           'value': line[field_name],
                                           'id': line['id']})
                if len(inserts[attr_type]) > DB_BATCH_SIZE:
                    insertNum += 1
                    reallyDoInserts(inserts[attr_type],
                                    attr_insert_map[attr_type], insertNum, db)
                    del inserts[attr_type]
    # Flush whatever is left in each bucket.
    for insert_label in inserts:
        insertNum += 1
        reallyDoInserts(inserts[insert_label],
                        attr_insert_map[insert_label], insertNum, db)
    db.commit()
    print('finished attribute inserts')
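# --- Generic sketch (not from the original source) of the batching pattern
# above: accumulate rows per key, flush a bucket once it reaches the batch
# size, then flush the stragglers at the end. All names are illustrative.
from collections import defaultdict

def batched_flush(rows, key_fn, flush, batch_size=1000):
    buckets = defaultdict(list)
    for row in rows:
        k = key_fn(row)
        buckets[k].append(row)
        if len(buckets[k]) >= batch_size:
            flush(k, buckets.pop(k))
    for k, batch in buckets.items():
        flush(k, batch)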
lookup_key = self.strip_hash.sub('', attrs['xlink:href'])
if self.lookup['thoroughfare'].get(lookup_key) is not None:
    self.object['street'] = self.lookup['thoroughfare'].get(lookup_key)
elif self.lookup['admin'].get(lookup_key) is not None:
    self.object['admin'] = self.lookup['admin'].get(lookup_key)
elif self.lookup['postal'].get(lookup_key) is not None:
    self.object['postcode'] = self.lookup['postal'].get(lookup_key)

# Detect SRS; create a CSV writer for it the first time it appears.
if name == 'gml:Point':
    self.srs = attrs.get('srsName', None)
    if self.srs is not None:
        self.srs = self.srs.split(':')[-1]
    if self.srs not in self.writers:
        self.writers[self.srs] = csv.DictWriter(
            open(self.out_dir + 'es-%s.csv' % self.srs, 'a'),
            ('lon', 'lat', 'number', 'street', 'postcode', 'admin'))
        self.writers[self.srs].writeheader()