Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
bed, i = {}, 0
start = timer()
with open(argv[1]) as fp:
for line in fp:
t = line[:-1].split("\t")
if not t[0] in bed:
bed[t[0]] = [[], [], [], None]
bed[t[0]][0].append(t[1])
bed[t[0]][1].append(t[2])
bed[t[0]][2].append(i)
i += 1
sys.stderr.write("Read in {} sec\n".format(timer() - start))
start = timer()
for ctg in bed:
bed[ctg][3] = NCLS(np.array(bed[ctg][0], dtype=np.long), np.array(bed[ctg][1], dtype=np.long), np.array(bed[ctg][2], dtype=np.long))
sys.stderr.write("Index in {} sec\n".format(timer() - start))
start = timer()
with open(argv[2]) as fp:
for line in fp:
t = line[:-1].split("\t")
if not t[0] in bed:
print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
else:
cnt = 0
it = bed[t[0]][3].find_overlap(long(t[1]), long(t[2]))
for r in it: cnt += 1
print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], cnt))
sys.stderr.write("Query in {} sec\n".format(timer() - start))
from ncls import NCLS
import pickle
import pandas as pd
import numpy as np
starts = np.array(list(reversed([3, 5, 8])), dtype=np.long)
ends = np.array(list(reversed([6, 7, 9])), dtype=np.long)
indexes = np.array(list(reversed([0, 1, 2])), dtype=np.long)
# starts = np.array([3, 5, 8], dtype=np.long)
# ends = np.array([6, 7, 9], dtype=np.long)
# indexes = np.array([0, 1, 2], dtype=np.long)
ncls = NCLS(starts, ends, indexes)
starts2 = np.array([1, 6])
ends2 = np.array([10, 7])
indexes2 = np.array([0, 1])
print(ncls.all_overlaps_both(starts2, ends2, indexes2))
from ncls import NCLS
import pickle
import pandas as pd
import numpy as np
starts = np.random.randint(0, int(1e8), int(1e7))
ends = starts + 100
ids = starts
ncls = NCLS(starts, ends, ids)
for i in ncls.find_overlap(0, 2):
print(i)
pickle.dump(ncls, open("test.pckl", "wb"))
import pickle
ncls2 = pickle.load(open("test.pckl", "rb"))
for i in ncls2.find_overlap(0, 2):
print(i)
def test_ncls():
# ids = starts
print(starts, ends, ids)
ncls = NCLS(starts, ends, ids)
print(ncls)
print(ncls.intervals())
assert list(ncls.find_overlap(0, 2)) == []
print("aaa", list(ncls.find_overlap(9_223_372_036_854_775_805, 9_223_372_036_854_775_806)))
assert list(ncls.find_overlap(0, 9_223_372_036_854_775_806)) == [(5, 6, 2147483647), (9223372036854775805, 9223372036854775807, 3)]
r, l = ncls.all_overlaps_both(starts, ends, ids)
assert list(r) == [2147483647, 3]
assert list(l) == [2147483647, 3]
def _overlap(scdf, ocdf, **kwargs):
invert = kwargs["invert"]
return_indexes = kwargs.get("return_indexes", False)
if scdf.empty or ocdf.empty:
return None
how = kwargs["how"]
assert how in "containment first".split() + [False, None]
starts = scdf.Start.values
ends = scdf.End.values
indexes = scdf.index.values
it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
if not how:
_indexes = it.all_overlaps_self(starts, ends, indexes)
elif how == "containment":
_indexes, _ = it.all_containments_both(starts, ends, indexes)
else:
_indexes = it.has_overlaps(starts, ends, indexes)
if invert:
_indexes = scdf.index.difference(_indexes)
if return_indexes:
return _indexes
return scdf.reindex(_indexes)
def _number_overlapping(scdf, ocdf, **kwargs):
keep_nonoverlapping = kwargs.get("keep_nonoverlapping", True)
if scdf.empty:
return None
if ocdf.empty:
if keep_nonoverlapping:
df = scdf.copy()
df.insert(df.shape[1], "NumberOverlaps", 0)
return df
else:
return None
oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
starts = scdf.Start.values
ends = scdf.End.values
indexes = scdf.index.values
_self_indexes, _other_indexes = oncls.all_overlaps_both(
starts, ends, indexes)
s = pd.Series(_self_indexes)
counts_per_read = s.value_counts()[s.unique()].reset_index()
counts_per_read.columns = ["Index", "Count"]
df = scdf.copy()
if keep_nonoverlapping:
_missing_indexes = np.setdiff1d(scdf.index, _self_indexes)
def create_ncls(df):
return NCLS(df.Start.values, df.End.values, df.index.values)
def _subtraction(scdf, ocdf, **kwargs):
if ocdf.empty or scdf.empty:
return scdf
strandedness = kwargs["strandedness"]
strand = True if strandedness else False
chromosome = scdf.Chromosome.head(1).iloc[0]
kwargs["chromosome"] = chromosome
if "Strand" in ocdf and strand:
strand = scdf.Strand.head(1).iloc[0]
kwargs["strand"] = strand
o = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
idx_self, new_starts, new_ends = o.set_difference_helper(
scdf.Start.values, scdf.End.values, scdf.index.values)
missing_idx = pd.Index(scdf.index).difference(idx_self)
idx_to_drop = new_starts != -1
new_starts = new_starts[idx_to_drop]
new_ends = new_ends[idx_to_drop]
idx_self = idx_self[idx_to_drop]
new_starts = pd.Series(new_starts, index=idx_self)
new_ends = pd.Series(new_ends, index=idx_self)
scdf = scdf.reindex(missing_idx.union(idx_self)).sort_index()
def NCLS(starts, ends, ids):
if starts.dtype == np.int64:
return NCLS64(starts, ends, ids)
elif starts.dtype == np.int32:
return NCLS32(starts, ends, ids)
else:
raise Exception("Starts/Ends not int64 or int32: " + str(starts.dtype))
def NCLS(starts, ends, ids):
if starts.dtype == np.int64:
return NCLS64(starts, ends, ids)
elif starts.dtype == np.int32:
return NCLS32(starts, ends, ids)
else:
raise Exception("Starts/Ends not int64 or int32: " + str(starts.dtype))