Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# --- Fragment: argument intake (enclosing function's `def` is outside this view;
# original indentation appears to have been stripped, lines kept verbatim) ---
global args
# job_input == None means we were not handed inputs by the platform:
# fall back to parsing the command line with the module-level argparse parser.
if job_input == None:
temp = vars(parser.parse_args(sys.argv[1:]))
# Copy only the options the user actually supplied (non-None) into `args`.
for key in temp:
if temp[key] != None:
if key == 'tags':
# tags arrive as one comma-separated string
args[key] = temp[key].split(",")
# remove whitespace around tags
for i in range(len(args[key])):
args[key][i] = args[key][i].rstrip().lstrip()
elif key == 'properties':
# properties arrive as a Python-literal string, e.g. "{'k': 'v'}"
try:
args[key] = ast.literal_eval(temp[key])
except SyntaxError:
raise dxpy.AppError("Cannot parse properties: " + temp[key])
else:
args[key] = temp[key]
else:
# Running as a platform job: inputs come pre-parsed.
args = job_input
print(args)
# Presence of a second reads file signals paired-end data.
if 'file2' in args:
paired = True
else:
paired = False
is_fasta, is_colorspace, qual_encoding = sniff_fastq(args["file"])
# NOTE(review): this condition is truncated by the fragment boundary —
# its body is not visible here.
if is_fasta == False and ('qual' in args or 'qual2' in args):
def unpack(input):
    """Decompress *input* if it is xz/bzip2/gzip compressed.

    Returns the path to an uncompressed copy ("uncompressed.gtf"), or
    *input* unchanged when the file is not compressed.

    Raises dxpy.AppError when the file is a tar archive (before or after
    decompression), fails the compressor's integrity check, or its format
    cannot be determined.

    Note: the original body duplicated the format-detection block inside
    `if uncomp_util != None:` (resetting uncomp_util to None), which made
    the duplicated branch dead code; that duplication is removed here.
    """
    m = magic.Magic()

    # determine compression format
    try:
        file_type = m.from_file(input)
    except Exception as e:
        raise dxpy.AppError("Error while identifying compression format: " + str(e))

    # if we find a tar file throw a program error telling the user to unpack it
    if file_type == 'application/x-tar':
        raise dxpy.AppError("App does not support tar files. Please unpack.")

    # Pick the streaming decompressor for the detected format.
    if file_type.startswith('XZ compressed data'):
        uncomp_util = 'xzcat'
    elif file_type.startswith('bzip2 compressed data'):
        uncomp_util = 'bzcat'
    elif file_type.startswith('gzip compressed data'):
        uncomp_util = 'zcat'
    elif file_type == 'POSIX tar archive (GNU)' or 'tar' in file_type:
        raise dxpy.AppError("Found a tar archive. Please untar your sequences before importing")
    else:
        # just return input filename since it's already uncompressed
        return input

    # bzcat does not support -t. Use non-streaming decompressors for testing input.
    test_util = {'xzcat': 'xz', 'bzcat': 'bzip2', 'zcat': 'gzip'}[uncomp_util]

    # List-form argv (shell=False) so the filename cannot be shell-interpreted.
    try:
        subprocess.check_call([test_util, "-t", input])
    except subprocess.CalledProcessError:
        raise dxpy.AppError("File failed integrity check by " + uncomp_util + ". Compressed file is corrupted.")

    # Peek at the first decompressed line: a tar archive may hide inside the
    # compression layer. (py3: next(pipe), not the py2 pipe.next())
    try:
        with subprocess.Popen([uncomp_util, input], stdout=subprocess.PIPE).stdout as pipe:
            line = next(pipe)
        uncomp_type = m.from_buffer(line)
    except Exception:
        raise dxpy.AppError("Error detecting file format after decompression")

    if uncomp_type == 'POSIX tar archive (GNU)' or 'tar' in uncomp_type:
        raise dxpy.AppError("Found a tar archive after decompression. Please untar your files before importing")

    # Fully decompress into a working file and hand that back.
    try:
        with open("uncompressed.gtf", "wb") as out_fh:
            subprocess.check_call([uncomp_util, "--stdout", input], stdout=out_fh)
        return "uncompressed.gtf"
    except subprocess.CalledProcessError:
        raise dxpy.AppError("Unable to open compressed input for reading")
# --- Fragment: build gene/transcript name maps from a spans gtable
# (enclosing scope and the `genes`/`transcripts` dicts' creation are outside
# this view; original indentation appears stripped, lines kept verbatim) ---
biotypePresent = False
if "gene_biotype" in table.get_col_names():
biotypePresent = True
for row in table.iterate_rows(want_dict=True):
# Gene rows: record a display name per span_id, preferring
# gene_id, then name, then the numeric span_id itself.
if row["type"] == "gene":
if genes.get(row["span_id"]) == None:
genes[row["span_id"]] = str(row["span_id"])
if row.get("gene_id") != None:
if row["gene_id"] != "":
genes[row["span_id"]] = row["gene_id"]
# Only fall back to "name" if gene_id did not supply one.
if row.get("name") != None and genes[row["span_id"]] == str(row["span_id"]):
if row["name"] != '':
genes[row["span_id"]] = row["name"]
else:
# span_id seen twice: the Genes type spec requires uniqueness.
raise dxpy.AppError("Error: span_id was not unique, in violation of the type spec for Genes. As a result, some gene_id data may be overwritten")
if row["type"] == "transcript":
if transcripts.get(row["span_id"]) == None:
transcriptInfo = {"name": str(row["span_id"])}
# NOTE(review): this guard tests row.get("gene_id") but then reads
# row["transcript_id"] — likely meant to test "transcript_id"; confirm.
if row.get("gene_id") != None:
if row["transcript_id"] != '':
transcriptInfo["name"] = row["transcript_id"]
if row.get("name") != None and transcriptInfo["name"] == str(row["span_id"]):
if row["name"] != '':
transcriptInfo["name"] = row["name"]
# Remember parent gene span; gene name is filled in elsewhere.
transcriptInfo['parent'] = row["parent_id"]
transcriptInfo['gene'] = ''
transcripts[row["span_id"]] = transcriptInfo
else:
raise dxpy.AppError("Error: span_id was not unique, in violation of the type spec for Genes. As a result, some transcript_id data may be overwritten")
# --- Fragment: format one alignment row as SAM fields (enclosing loop and
# the `values`/`flag`/`chromosome`/`lo` bindings are outside this view;
# original indentation appears stripped, lines kept verbatim) ---
else:
readName = values["name"]
# SAM uses "*" for a missing read name / quality string.
if readName.strip("@") == "":
readName = "*"
if values.get("quality") == None or values.get("quality") == "":
qual = "*"
else:
qual = values["quality"].rstrip('\n')
seq = values["sequence"]
# SAM stores reverse-strand reads as the reverse complement,
# with the quality string reversed to match.
if values["negative_strand"]:
try:
seq = reverseComplement(seq)
except ValueError as e:
raise dxpy.AppError("Error converting row %d: %s" % (row["__id__"], e.message))
qual = qual[::-1]
# TLEN is 0 for unpaired reads, cross-chromosome pairs, or unmapped mates;
# otherwise the outer span of the pair, negative when this read is rightmost.
if values["mate_id"] == -1 or values["chr"] != values["chr2"] or values["chr"] == '' or values["chr"] == '*':
tlen = 0
else:
tlen = (max(int(values["hi2"]),int(values["hi"])) - min(int(values["lo2"]),int(values["lo"])))
if int(values["lo"]) > int(values["lo2"]):
tlen *= -1
# The 11 mandatory SAM columns, followed by optional tag fields.
out_row = [readName.strip("@"), str(flag), chromosome, str(lo), str(values["error_probability"]), values["cigar"] , chromosome2, str(lo2), str(tlen), seq, qual]
tag_values = {c: values[c] for c in sam_col_names if not tag_value_is_default(values[c])}
# NOTE(review): .iteritems() is Python 2 — the rest of this file uses
# Python 3 print(); this line would raise AttributeError on py3. Confirm.
out_row.extend([format_tag_field(name, value, sam_col_types) for name, value in tag_values.iteritems()])
if assignReadGroup != "":
out_row.append("RG:Z:" + assignReadGroup)
# --- Fragment: create and annotate the spans gtable (enclosing function and
# the tag/reference/property variables are outside this view; original
# indentation appears stripped, lines kept verbatim) ---
# Maps GTF feature types to the span-type labels stored in the table.
capturedTypes = {"5UTR": "5' UTR", "3UTR": "3' UTR", "CDS": "CDS", "inter": "intergenic", "inter_CNS": "intergenic_conserved", "intron_CNS": "intron_conserved", "exon": "exon", "transcript": "transcript", "gene":"gene", "stop_codon": "stop_codon", "start_codon":"start_codon"}
#Rows of this type will not be written to the gtable as their information is fully encompassed by the rest of the data
##Isolate the attribute tags from the file and check integrity
spansTable, additionalColumns = constructTable(inputFileName)
spansTable.add_tags(tag)
types = ["Genes", "gri"]
for x in additional_type:
types.append(x)
spansTable.add_types(types)
# Details link the table back to its reference genome and source file.
details = {'original_contigset': dxpy.dxlink(reference)}
# property_key/property_value are parallel lists; lengths must match.
if len(property_key) != len(property_value):
raise dxpy.AppError("Expected each provided property to have a corresponding value")
for i in range(len(property_key)):
details[property_key[i]] = property_value[i]
# NOTE(review): additional_type is appended to `types` a second time here,
# after add_types() was already called — looks like a duplicated paste with
# no effect unless `types` is reused later; confirm intent.
for x in additional_type:
types.append(x)
if file_id != None:
details['original_file'] = dxpy.dxlink(file_id)
spansTable.set_details(details)
# Fall back to the input file's name when no output name was given.
if outputName == '':
spansTable.rename(fileName)
else:
spansTable.rename(outputName)
#This passes through the file calculates the gene and transcript models
genes = {}
transcripts = {}
# --- Fragment: dispatch BED import by detected subtype and clean up
# (enclosing function is outside this view — note the trailing `return`;
# original indentation appears stripped, lines kept verbatim) ---
delimiter = detect_type(import_filename)["delimiter"]
print("Bed type is : " + bed_type, file=sys.stderr)
if bed_type == "genes":
print("Importing as Genes Type", file=sys.stderr)
job_outputs.append(import_genes(import_filename, name, reference, file_id, additional_types, property_keys, property_values, tags, delimiter))
elif bed_type == "spans" or bed_type == "bedDetail":
print("Importing as Spans Type", file=sys.stderr)
# bedDetail is imported via the spans path with an extra flag.
if bed_type == "bedDetail":
print("input file is in 'bedDetails' format...", file=sys.stderr)
bedDetail=True
else:
bedDetail=False
job_outputs.append(import_spans(import_filename, name, reference, file_id, additional_types, property_keys, property_values, tags, bedDetail, delimiter))
else:
raise dxpy.AppError("Unable to determine type of BED file")
# NOTE(review): shell=True with a string-joined filename breaks on spaces
# and is shell-injection-prone; os.remove(import_filename) would be safer.
subprocess.check_call(" ".join(["rm", import_filename]), shell=True)
# Remove the uncompressed temp copy only if decompression produced one.
if(bed_filename != bed_filename_uncomp):
subprocess.check_call(" ".join(["rm", bed_filename_uncomp]), shell=True)
print(json.dumps(job_outputs))
return job_outputs