Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
bioframe.merge(df1),
check_dtype=False,
check_exact=True,
)
# test on=['chrom',...] argument
df1 = pd.DataFrame(
[
["chr1", 3, 8, "+", "cat", 5.5],
["chr1", 3, 8, "-", "dog", 6.5],
["chr1", 6, 10, "-", "cat", 6.5],
["chrX", 6, 10, "-", "cat", 6.5],
],
columns=["chrom", "start", "end", "strand", "animal", "location"],
)
assert len(bioframe.merge(df1, on=None)) == 2
assert len(bioframe.merge(df1, on=["strand"])) == 3
assert len(bioframe.merge(df1, on=["strand", "location"])) == 3
assert len(bioframe.merge(df1, on=["strand", "location", "animal"])) == 4
d = """ chrom start end animal n_intervals
0 chr1 3 10 cat 2
1 chr1 3 8 dog 1
2 chrX 6 10 cat 1"""
df = pd.read_csv(StringIO(d), sep=r"\s+")
pd.testing.assert_frame_equal(
df, bioframe.merge(df1, on=["animal"]), check_dtype=False,
)
df1 = pd.DataFrame(
[["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14],],
columns=["chrom", "start", "end"],
)
# the last interval does not overlap the first three with default min_dist=0
assert (bioframe.merge(df1)["n_intervals"].values == np.array([3, 1])).all()
# adjacent intervals are not clustered with min_dist=none
assert (
bioframe.merge(df1, min_dist=None)["n_intervals"].values == np.array([2, 1, 1])
).all()
# all intervals part of one cluster
assert (
bioframe.merge(df1, min_dist=2)["n_intervals"].values == np.array([4])
).all()
df1.iloc[0, 0] = "chrX"
assert (
bioframe.merge(df1, min_dist=None)["n_intervals"].values
== np.array([1, 1, 1, 1])
).all()
assert (
bioframe.merge(df1, min_dist=0)["n_intervals"].values == np.array([2, 1, 1])
).all()
# total number of intervals should equal length of original dataframe
mock_df = mock_bioframe()
assert np.sum(bioframe.merge(mock_df, min_dist=0)["n_intervals"].values) == len(
mock_df
)
def test_merge():
df1 = pd.DataFrame(
[["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14],],
columns=["chrom", "start", "end"],
)
# the last interval does not overlap the first three with default min_dist=0
assert (bioframe.merge(df1)["n_intervals"].values == np.array([3, 1])).all()
# adjacent intervals are not clustered with min_dist=none
assert (
bioframe.merge(df1, min_dist=None)["n_intervals"].values == np.array([2, 1, 1])
).all()
# all intervals part of one cluster
assert (
bioframe.merge(df1, min_dist=2)["n_intervals"].values == np.array([4])
).all()
df1.iloc[0, 0] = "chrX"
assert (
bioframe.merge(df1, min_dist=None)["n_intervals"].values
== np.array([1, 1, 1, 1])
).all()
assert (
bioframe.merge(df1, min_dist=2)["n_intervals"].values == np.array([4])
).all()
df1.iloc[0, 0] = "chrX"
assert (
bioframe.merge(df1, min_dist=None)["n_intervals"].values
== np.array([1, 1, 1, 1])
).all()
assert (
bioframe.merge(df1, min_dist=0)["n_intervals"].values == np.array([2, 1, 1])
).all()
# total number of intervals should equal length of original dataframe
mock_df = mock_bioframe()
assert np.sum(bioframe.merge(mock_df, min_dist=0)["n_intervals"].values) == len(
mock_df
)
# test consistency with pyranges
pd.testing.assert_frame_equal(
pyranges_to_bioframe(bioframe_to_pyranges(df1).merge(count=True)),
bioframe.merge(df1),
check_dtype=False,
check_exact=True,
)
# test on=['chrom',...] argument
df1 = pd.DataFrame(
[
["chr1", 3, 8, "+", "cat", 5.5],
["chr1", 3, 8, "-", "dog", 6.5],
== np.array([1, 1, 1, 1])
).all()
assert (
bioframe.merge(df1, min_dist=0)["n_intervals"].values == np.array([2, 1, 1])
).all()
# total number of intervals should equal length of original dataframe
mock_df = mock_bioframe()
assert np.sum(bioframe.merge(mock_df, min_dist=0)["n_intervals"].values) == len(
mock_df
)
# test consistency with pyranges
pd.testing.assert_frame_equal(
pyranges_to_bioframe(bioframe_to_pyranges(df1).merge(count=True)),
bioframe.merge(df1),
check_dtype=False,
check_exact=True,
)
# test on=['chrom',...] argument
df1 = pd.DataFrame(
[
["chr1", 3, 8, "+", "cat", 5.5],
["chr1", 3, 8, "-", "dog", 6.5],
["chr1", 6, 10, "-", "cat", 6.5],
["chrX", 6, 10, "-", "cat", 6.5],
],
columns=["chrom", "start", "end", "strand", "animal", "location"],
)
assert len(bioframe.merge(df1, on=None)) == 2
assert len(bioframe.merge(df1, on=["strand"])) == 3
["chr1", 6, 10, "-", "cat", 6.5],
["chrX", 6, 10, "-", "cat", 6.5],
],
columns=["chrom", "start", "end", "strand", "animal", "location"],
)
assert len(bioframe.merge(df1, on=None)) == 2
assert len(bioframe.merge(df1, on=["strand"])) == 3
assert len(bioframe.merge(df1, on=["strand", "location"])) == 3
assert len(bioframe.merge(df1, on=["strand", "location", "animal"])) == 4
d = """ chrom start end animal n_intervals
0 chr1 3 10 cat 2
1 chr1 3 8 dog 1
2 chrX 6 10 cat 1"""
df = pd.read_csv(StringIO(d), sep=r"\s+")
pd.testing.assert_frame_equal(
df, bioframe.merge(df1, on=["animal"]), check_dtype=False,
)