Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
df_annotated = bioframe.cluster(df1)
assert (
df_annotated["cluster"].values == np.array([0, 0, 0, 1])
).all() # the last interval does not overlap the first three
df_annotated = bioframe.cluster(df1, min_dist=2)
assert (
df_annotated["cluster"].values == np.array([0, 0, 0, 0])
).all() # all intervals part of the same cluster
df_annotated = bioframe.cluster(df1, min_dist=None)
assert (
df_annotated["cluster"].values == np.array([0, 0, 1, 2])
).all() # adjacent intervals not clustered
df1.iloc[0, 0] = "chrX"
df_annotated = bioframe.cluster(df1)
assert (
df_annotated["cluster"].values == np.array([2, 0, 0, 1])
).all() # do not cluster intervals across chromosomes
# test consistency with pyranges (which automatically sorts df upon creation and uses 1-based indexing for clusters)
assert (
(bioframe_to_pyranges(df1).cluster(count=True).df["Cluster"].values - 1)
== bioframe.cluster(df1.sort_values(["chrom", "start"]))["cluster"].values
).all()
# test on=[] argument
df1 = pd.DataFrame(
[
["chr1", 3, 8, "+", "cat", 5.5],
["chr1", 3, 8, "-", "dog", 6.5],
["chr1", 6, 10, "-", "cat", 6.5],
def test_cluster():
df1 = pd.DataFrame(
[["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14],],
columns=["chrom", "start", "end"],
)
df_annotated = bioframe.cluster(df1)
assert (
df_annotated["cluster"].values == np.array([0, 0, 0, 1])
).all() # the last interval does not overlap the first three
df_annotated = bioframe.cluster(df1, min_dist=2)
assert (
df_annotated["cluster"].values == np.array([0, 0, 0, 0])
).all() # all intervals part of the same cluster
df_annotated = bioframe.cluster(df1, min_dist=None)
assert (
df_annotated["cluster"].values == np.array([0, 0, 1, 2])
).all() # adjacent intervals not clustered
df1.iloc[0, 0] = "chrX"
df_annotated = bioframe.cluster(df1)
assert (
df_annotated["cluster"].values == np.array([2, 0, 0, 1])
).all() # do not cluster intervals across chromosomes
# test consistency with pyranges (which automatically sorts df upon creation and uses 1-based indexing for clusters)
assert (
(bioframe_to_pyranges(df1).cluster(count=True).df["Cluster"].values - 1)
== bioframe.cluster(df1.sort_values(["chrom", "start"]))["cluster"].values
).all()
df_annotated = bioframe.cluster(df1, min_dist=None)
assert (
df_annotated["cluster"].values == np.array([0, 0, 1, 2])
).all() # adjacent intervals not clustered
df1.iloc[0, 0] = "chrX"
df_annotated = bioframe.cluster(df1)
assert (
df_annotated["cluster"].values == np.array([2, 0, 0, 1])
).all() # do not cluster intervals across chromosomes
# test consistency with pyranges (which automatically sorts df upon creation and uses 1-based indexing for clusters)
assert (
(bioframe_to_pyranges(df1).cluster(count=True).df["Cluster"].values - 1)
== bioframe.cluster(df1.sort_values(["chrom", "start"]))["cluster"].values
).all()
# test on=[] argument
df1 = pd.DataFrame(
[
["chr1", 3, 8, "+", "cat", 5.5],
["chr1", 3, 8, "-", "dog", 6.5],
["chr1", 6, 10, "-", "cat", 6.5],
["chrX", 6, 10, "-", "cat", 6.5],
],
columns=["chrom", "start", "end", "strand", "animal", "location"],
)
assert (
bioframe.cluster(df1, on=["animal"])["cluster"].values == np.array([0, 1, 0, 2])
).all()
assert (
# test on=[] argument
df1 = pd.DataFrame(
[
["chr1", 3, 8, "+", "cat", 5.5],
["chr1", 3, 8, "-", "dog", 6.5],
["chr1", 6, 10, "-", "cat", 6.5],
["chrX", 6, 10, "-", "cat", 6.5],
],
columns=["chrom", "start", "end", "strand", "animal", "location"],
)
assert (
bioframe.cluster(df1, on=["animal"])["cluster"].values == np.array([0, 1, 0, 2])
).all()
assert (
bioframe.cluster(df1, on=["strand"])["cluster"].values == np.array([0, 1, 1, 2])
).all()
assert (
bioframe.cluster(df1, on=["location", "animal"])["cluster"].values
== np.array([0, 2, 1, 3])
).all()