import sys

import avro.datafile
import avro.io
import avro.schema


def _python_create_file(filename):
    # `json_schema` is assumed to be defined at module level (see the
    # example below). avro-python3 renamed parse() to Parse(), so pick
    # the spelling that matches the running interpreter.
    if sys.version_info >= (3,):
        schema = avro.schema.Parse(json_schema)
    else:
        schema = avro.schema.parse(json_schema)
    fp = open(filename, 'wb')
    writer = avro.datafile.DataFileWriter(fp, avro.io.DatumWriter(), schema)
    for i in range(1):
        writer.append({"name": "Alyssa", "favorite_number": 256})
        writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
    writer.close()
    fp.close()
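
# A minimal round trip using the helper above. The `json_schema` it
# expects is an assumption here (the classic Avro "User" record, chosen
# to match the records it appends), since the snippet doesn't define it.
import json

json_schema = json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]},
    ],
})

_python_create_file("users.avro")

# Read the records back to confirm the round trip.
with open("users.avro", "rb") as fp:
    reader = avro.datafile.DataFileReader(fp, avro.io.DatumReader())
    for record in reader:
        print(record)
    reader.close()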

for key in data_args.primary_key_cols.split(','):
    rows |= 'Enforcing primary key: {}'.format(key) >> EnforcePrimaryKeys(key)

if data_args.csv_schema_order:
    (rows
     | 'Order fields for CSV writing.' >> beam.FlatMap(
         lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
     | 'Write to GCS' >> beam.io.textio.WriteToText(
         file_path_prefix=data_args.output_prefix,
         file_name_suffix='.csv')
    )

if data_args.avro_schema_file:
    avsc = avro.schema.parse(open(data_args.avro_schema_file, 'rb').read())
    (rows
     # Need to convert time stamps from strings to timestamp-micros.
     | 'Fix date and time Types for Avro.' >> beam.FlatMap(
         lambda row: fix_record_for_avro(row, avsc))
     | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
         file_path_prefix=data_args.output_prefix,
         codec='null',
         file_name_suffix='.avro',
         use_fastavro=True,
         schema=avsc)
    )

if data_args.output_bq_table:
    (rows

# Load the Avro schemas
try:
    # This holds the schema for AlleleGroups
    allele_group_schema = avro.schema.parse(
        open(options.allele_group_schema).read())
except IOError:
    logging.critical("Could not load Avro AlleleGroup schema {}".format(
        options.allele_group_schema))
    logging.critical("Check your --allele_group_schema option")
    sys.exit(1)

try:
    # This holds the schema for Adjacencies
    adjacency_schema = avro.schema.parse(
        open(options.adjacency_schema).read())
except IOError:
    logging.critical("Could not load Avro Adjacency schema {}".format(
        options.adjacency_schema))
    logging.critical("Check your --adjacency_schema option")
    sys.exit(1)

# Make Avro-format output file writers. This one is for allele groups.
allele_group_writer = avro.datafile.DataFileWriter(
    options.allele_group_file, avro.io.DatumWriter(), allele_group_schema)
# This one is for adjacencies.
adjacency_writer = avro.datafile.DataFileWriter(
    options.adjacency_file, avro.io.DatumWriter(), adjacency_schema)

# Make a VCF reader to read the input VCF.
vcf_reader = vcf.Reader(options.vcf)
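
# A hedged sketch of the loop that would follow this setup: walk the VCF
# and emit one Avro record per writer. record_to_allele_group and
# record_to_adjacency are hypothetical placeholders, not functions from
# the original project.
for record in vcf_reader:
    allele_group_writer.append(record_to_allele_group(record))  # hypothetical
    adjacency_writer.append(record_to_adjacency(record))        # hypothetical

# DataFileWriter buffers blocks internally, so closing flushes the rest.
allele_group_writer.close()
adjacency_writer.close()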

# Inside avro.datafile.DataFileWriter.__init__; the branch on
# writers_schema decides between writing a fresh header and appending.
if writers_schema is not None:
    self.set_meta('avro.codec', codec)
    self.set_meta('avro.schema', str(writers_schema))
    self.datum_writer.writers_schema = writers_schema
else:
    # open writer for reading to collect metadata
    dfr = DataFileReader(writer, avro.io.DatumReader())
    # TODO(hammer): collect arbitrary metadata
    # collect metadata
    self._sync_marker = dfr.sync_marker
    self.set_meta('avro.codec', dfr.get_meta('avro.codec'))
    # get schema used to write existing file
    schema_from_file = dfr.get_meta('avro.schema')
    self.set_meta('avro.schema', schema_from_file)
    self.datum_writer.writers_schema = avro.schema.parse(schema_from_file)
    # seek to the end of the file and prepare for writing
    writer.seek(0, 2)
    self._header_written = True
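
# From the caller's side, that else branch is what enables append mode:
# open an existing container file readable and writable, pass no
# writers_schema, and DataFileWriter recovers the codec, schema, and
# sync marker from the file itself. A minimal sketch, assuming
# "users.avro" exists and the new record matches its schema:
import avro.datafile
import avro.io

fp = open("users.avro", "ab+")
writer = avro.datafile.DataFileWriter(fp, avro.io.DatumWriter())
writer.append({"name": "Carol", "favorite_number": 3})
writer.close()  # also closes fp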
"""
class ListReferenceBasesRequest(ProtocolElement):
"""
The query parameters for a request to GET /references/{id}/bases,
for example: GET /references/{id}/bases?start=100&end=200
"""
_schemaSource = """
{"namespace": "org.ga4gh.methods", "type": "record", "name":
"ListReferenceBasesRequest", "fields": [{"default": 0, "doc": "",
"type": "long", "name": "start"}, {"default": null, "doc": "", "type":
["null", "long"], "name": "end"}, {"default": null, "doc": "", "type":
["null", "string"], "name": "pageToken"}], "doc": ""}
"""
schema = avro.schema.parse(_schemaSource)
requiredFields = set([])
@classmethod
def isEmbeddedType(cls, fieldName):
embeddedTypes = {}
return fieldName in embeddedTypes
@classmethod
def getEmbeddedType(cls, fieldName):
embeddedTypes = {}
return embeddedTypes[fieldName]
__slots__ = [
'end', 'pageToken', 'start'
]
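
# Because `schema` is parsed once at class-definition time, field
# metadata can be inspected without creating an instance, e.g.:
for field in ListReferenceBasesRequest.schema.fields:
    print(field.name, field.type)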

def _schema(self):
    # Keeping this as an instance method because of issues with sharing
    # this data across processes.
    schema_path = os.path.join(
        os.path.dirname(__file__),
        'schemas/envelope_v1.avsc'
    )
    return avro.schema.parse(open(schema_path).read())
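
# If re-parsing on every call ever matters, one possible alternative
# (not in the original code) is a per-process cache: functools.lru_cache
# gives each worker process its own parsed copy, which sidesteps the
# cross-process sharing issue the comment above mentions. Python 3 only.
import functools
import os

import avro.schema


@functools.lru_cache(maxsize=1)
def _load_envelope_schema():
    # Parsed once per process; forked workers each build their own copy.
    schema_path = os.path.join(
        os.path.dirname(__file__),
        'schemas/envelope_v1.avsc'
    )
    with open(schema_path) as f:
        return avro.schema.parse(f.read())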

    References are designed to be immutable.
    """
    _schemaSource = """
{"namespace": "org.ga4gh.models", "type": "record", "name":
"Reference", "fields": [{"doc": "", "type": "string", "name": "id"},
{"doc": "", "type": "long", "name": "length"}, {"doc": "", "type":
"string", "name": "md5checksum"}, {"doc": "", "type": "string",
"name": "name"}, {"default": null, "doc": "", "type": ["null",
"string"], "name": "sourceURI"}, {"doc": "", "type": {"items":
"string", "type": "array"}, "name": "sourceAccessions"}, {"default":
false, "doc": "", "type": "boolean", "name": "isDerived"}, {"default":
null, "doc": "", "type": ["null", "float"], "name":
"sourceDivergence"}, {"default": null, "doc": "", "type": ["null",
"int"], "name": "ncbiTaxonId"}], "doc": ""}
"""
    schema = avro.schema.parse(_schemaSource)
    requiredFields = set([
        "id",
        "length",
        "md5checksum",
        "name",
        "sourceAccessions",
    ])

    @classmethod
    def isEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return fieldName in embeddedTypes

    @classmethod
    def getEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return embeddedTypes[fieldName]

import json
import sys

from avro import datafile, io, schema


def main():
    if len(sys.argv) < 2:
        print "Usage: cat input.json | python2.7 JSONtoAvro.py output"
        return

    s = schema.parse(open("tweet.avsc").read())
    f = open(sys.argv[1], "wb")
    writer = datafile.DataFileWriter(f, io.DatumWriter(), s, codec='deflate')

    failed = 0
    for line in sys.stdin:
        line = line.strip()
        try:
            data = json.loads(line)
        except ValueError:
            # Skip lines that aren't valid JSON.
            continue
        try:
            writer.append(data)
        except io.AvroTypeException:
            # Record doesn't match the schema; count it and move on.
            failed += 1

    possibly work' for grouping data (e.g. Dataset X has all the
    reads, variants, and expression levels for a particular research
    project; Dataset Y has all the work product from a particular
    grant). For data accessors, they're a simple way to scope
    exploration and analysis (e.g. are there any supporting examples
    in 1000genomes? what's the distribution of that result in the data
    from our project?)
    """
    _schemaSource = """
{"namespace": "org.ga4gh.models", "type": "record", "name": "Dataset",
"fields": [{"doc": "", "type": "string", "name": "id"}, {"default":
null, "doc": "", "type": ["null", "string"], "name": "name"},
{"default": null, "doc": "", "type": ["null", "string"], "name":
"description"}], "doc": ""}
"""
    schema = avro.schema.parse(_schemaSource)
    requiredFields = set([
        "id",
    ])

    @classmethod
    def isEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return fieldName in embeddedTypes

    @classmethod
    def getEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return embeddedTypes[fieldName]

    __slots__ = [
        'description', 'id', 'name'
    ]

import json

import avro.io
import avro.schema


def schema_for_item(value, array_schema, tool):
    if not array_schema:
        return None
    opts = array_schema.get('items', [])
    if not opts:
        return None
    if not isinstance(opts, list):
        opts = [opts]
    opts = [schema_by_name(opt, tool) for opt in opts]
    if len(opts) == 1:
        return opts[0]
    # For a union of item types, return the first option the value
    # validates against.
    for opt in opts:
        sch = avro.schema.parse(json.dumps(opt))
        if avro.io.validate(sch, value):
            return opt
    return None
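
# A hypothetical call, with schema_by_name stubbed as a pass-through so
# the union branch is exercised. The real resolver is assumed to live in
# the same module as schema_for_item.
def schema_by_name(opt, tool):
    return opt

array_schema = {"type": "array", "items": ["int", "string"]}

print(schema_for_item(42, array_schema, None))     # -> "int"
print(schema_for_item("abc", array_schema, None))  # -> "string"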