How to use the pygtftk.utils.make_tmp_file function in pygtftk

To help you get started, we’ve selected a few pygtftk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dputhier / pygtftk / pygtftk / plugins / divergent.py View on Github external
2, 3,
                                                      4, 5])
    message("Intersecting...")

    if no_strandness:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=False)
    else:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=True)

    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    for i in prom_with_tss_bo:

        tx_id_tss, gn_id_tss = i.fields[9].split("||")
        tx_id_prom, gene_id_prom = i.fields[3].split("||")

        if gene_id_prom != gn_id_tss:
            if tx_id_prom in tx_with_divergent:
                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                if dist < dist_to_divergent[tx_id_prom]:
                    dist_to_divergent[tx_id_prom] = dist
                    tx_with_divergent[tx_id_prom] = tx_id_tss
            else:
github dputhier / pygtftk / pygtftk / plugins / mk_matrix.py View on Github external
message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]
                                            ).slop(s=True,
                                                   l=-1,
                                                   r=downstream,
                                                   g=chrom_info.name).cut(bed_col)
            else:
                message(
                    "Getting downstream regions (%d bins)." %
                    around_bin_nb)

                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)
            dws_bed_file = make_tmp_file(prefix="dowstream_region" + ft_type,
                                         suffix=".bed")

            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[
                                         x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)
github dputhier / pygtftk / pygtftk / plugins / discretize_key.py View on Github external
cat_label = [str(x).replace(", ", "_") for x in cat_label]

        # The string can be very problematic later...
        breaks.categories = cat_label

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    #
    # Write to disk
    #
    # -------------------------------------------------------------------------

    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    with tmp_file as tp_file:
        for p, v in zip(dest_pos, breaks):
            tp_file.write(str(p) + "\t" + str(v) + '\n')

    gtf.add_attr_to_pos(tmp_file,
                        new_key=dest_key).write(outputfile,
                                                gc_off=True)

    close_properly(outputfile, inputfile)
github dputhier / pygtftk / pygtftk / plugins / great_reg_domains.py View on Github external
# Ignore features in B that overlap A
                                                         io=True,
                                                         # In addition to the closest feature in B report distance
                                                         # use negative distances to report upstream features.
                                                         # Report distance with respect to A.
                                                         # When A is on the - strand, "upstream" means B has a
                                                         # higher(start, stop).
                                                         D="a",
                                                         # Ignore features in B that are upstream of features in A
                                                         iu=True,
                                                         # How ties are handled. "first"  Report the first tie
                                                         t="first",
                                                         # Require that the query and the closest hit have different names/gene_ids.
                                                         N=True)

        basal_reg_bed_downstream_file = make_tmp_file(prefix='basal_reg_bed_upstream', suffix='.bed')
        basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

        for line in basal_reg_bed_downstream:

            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
github dputhier / pygtftk / pygtftk / plugins / alt_prom_cov.py View on Github external
del gtf

    # -------------------------------------------------------------------------
    # Check whether TSSs intersect a H3K4me3 peak
    #
    # -------------------------------------------------------------------------

    message("Checking TSS intersection with merged BED file.")

    tx_to_peak = defaultdict()

    merged_peaks_bo = BedTool(merged_peaks)

    tss_bo_x_bf = tss_bo.intersect(merged_peaks_bo, wa=True, wb=True)
    tmp_bed = make_tmp_file(prefix="TSS_intersect",
                            suffix=".bed")
    tss_bo_x_bf.saveas(tmp_bed.name)

    for line in tss_bo_x_bf:
        tx_to_peak[line[3]] = line[9]

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Starting coverage analysis.")

    bed_cov_file = make_tmp_file(prefix="merged_peaks", suffix=".cov")
    bed_cov = bw_cov_mp(bw_list=[x.name for x in bigwig_list],
github dputhier / pygtftk / pygtftk / plugins / ologram.py View on Github external
minibatch_size=minibatch_size, minibatch_nb=minibatch_nb,
                              bed_excl=bed_excl, use_markov_shuffling=use_markov,
                              nb_threads=nb_threads)

    # Initialize result dict
    hits = dict()

    if not no_gtf:
        if not no_basic_feature:
            for feat_type in gtf.get_feature_list(nr=True):
                message("Processing " + str(feat_type), type="INFO")
                gtf_sub = gtf.select_by_key("feature", feat_type, 0)
                gtf_sub_bed = gtf_sub.to_bed(name=["transcript_id",
                                                   "gene_id",
                                                   "exon_id"]).sort().merge()  # merging bed file !
                tmp_file = make_tmp_file(prefix="ologram_" + str(feat_type), suffix='.bed')
                gtf_sub_bed.saveas(tmp_file.name)

                del gtf_sub

                hits[feat_type] = overlap_partial(bedA=peak_file, bedsB=gtf_sub_bed, ft_type=feat_type)

            nb_gene_line = len(gtf.select_by_key(key="feature", value="gene"))
            nb_tx_line = len(gtf.select_by_key(key="feature", value="transcript"))

            if nb_gene_line and nb_tx_line:
                # -------------------------------------------------------------------------
                # Get the intergenic regions
                # -------------------------------------------------------------------------
                message("Processing intergenic regions", type="INFO")
                gtf_sub_bed = gtf.get_intergenic(chrom_info,
                                                 0,
github dputhier / pygtftk / pygtftk / plugins / mk_matrix.py View on Github external
else:
        message("Loading regions")

        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(s=True,
                                                                       l=upstream,
                                                                       r=downstream,
                                                                       g=chrom_info.name)
        else:
            message("Unknown method.")

    # Save for tracability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    #
    # Print a header in the output file
    #
    # -------------------------------------------------------------------------
    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
github dputhier / pygtftk / pygtftk / plugins / tss_numbering.py View on Github external
tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
    tx_to_gn = gtf.get_tx_to_gn()

    for k in tss:
        gn_id = tx_to_gn[k]
        gn_tss_dist[gn_id][k] = int(tss[k])

    # if_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")
            if gn_id not in gn_how_many_tss:
                gn_how_many_tss[gn_id] = tss_num
            else:
                if int(tss_num) > int(gn_how_many_tss[gn_id]):
                    gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    gtf = gtf.add_attr_from_file(feat='transcript',
github dputhier / pygtftk / pygtftk / plugins / get_tx_seq.py View on Github external
as_gz_ext = [True for x in genome if x.name.endswith(".gz")]

    if any(as_gz_ext):
        message("Genome in gz format is not currently supported.", type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]
    else:
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]

    rev_comp = not no_rev_comp
github dputhier / pygtftk / pygtftk / plugins / get_ceas_records.py View on Github external
def get_ceas_records(
        inputfile=None,
        outputfile=None,
        show_tables=False,
        target_table='GeneTable'):
    """
    Convert a CEAS sqlite file back into a flat file.
    """

    # ----------------------------------------------------------------------
    # load the CEAS file
    # ----------------------------------------------------------------------

    if inputfile.name.endswith('gz'):
        tmp_file = make_tmp_file(prefix='ceas_gunzip', suffix='.txt')
        with gzip.open(inputfile.name, 'rb') as f_in:
            with open(tmp_file.name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        inputfile = open(tmp_file.name)

    conn = sqlite3.connect(inputfile.name)
    cursor = conn.cursor()

    # ----------------------------------------------------------------------
    # A func to get the list of tables
    # ----------------------------------------------------------------------

    def get_tables(cursor):

        out_list = list()
        cursor.execute('SELECT name from sqlite_master where type= "table"')