Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_tokenize_one_hot():
    """Check the shape of `one_hot` output and the values from `tokenize`."""
    # A 3-letter sequence is expected to one-hot encode into a (3, 4) array.
    encoded = one_hot("ACG", DNA, "N")
    assert encoded.shape == (3, 4)

    # Tokenizing the same sequence yields one integer per letter.
    plain_tokens = tokenize("ACG", DNA, "N")
    assert plain_tokens.shape == (3,)
    expected_plain = np.array([0, 1, 2])
    assert np.array_equal(plain_tokens, expected_plain)

    # N mapped to -1
    tokens_with_neutral = tokenize("TGTN", DNA, "N")
    expected_with_neutral = np.array([3, 2, 3, -1])
    assert np.array_equal(tokens_with_neutral, expected_with_neutral)
def test_tokenize():
    """Exercise `tokenize` with single-letter and three-letter alphabets."""
    # Single-letter alphabet entries: one token per character.
    single_letter = tokenize("ACGTTA", DNA, neutral_alphabet="N")
    assert np.all(single_letter == [0, 1, 2, 3, 3, 0])

    # Three-letter alphabet entries: one token per triplet.
    triplets = tokenize("ACGTGATGA", ["ACG", "TGA"], neutral_alphabet="NNN")
    assert np.all(triplets == [0, 1, 1])

    # Triplets that match the neutral alphabet come back as -1.
    neutral_triplets = tokenize("ACGTGATGA", ["ACG"], neutral_alphabet="TGA")
    assert np.all(neutral_triplets == [0, -1, -1])

    # "TGA" is in neither the alphabet nor the neutral alphabet here,
    # so the call is expected to raise.
    with pytest.raises(Exception):
        tokenize("ACGTGATGA", ["ACG"], neutral_alphabet="NNN")
def tokenize(seq, alphabet=DNA, neutral_alphabet=["N"]):
"""Convert sequence to integers
# Arguments
seq: Sequence to encode
alphabet: Alphabet to use
neutral_alphabet: Neutral alphabet -> assign those values to -1
# Returns
List of length `len(seq)` with integers from `-1` to `len(alphabet) - 1`
"""
# NOTE(review): the default `neutral_alphabet=["N"]` is a mutable list shared
# between calls. The visible code only reads or rebinds it; confirm that the
# rest of the body never mutates it in place.
# NOTE(review): the tests for this function expect one token per
# alphabet-entry-sized chunk (9 characters with 3-letter entries -> 3 tokens)
# and check `.shape` on the result, so the documented "list of length
# `len(seq)`" looks accurate only for single-character alphabets -- TODO confirm.
# Req: all alphabets have the same length
# A bare string is treated as a single neutral entry and wrapped in a list so
# it can be concatenated with `alphabet` below.
if isinstance(neutral_alphabet, str):
neutral_alphabet = [neutral_alphabet]
# Entry width is taken from the first alphabet entry.
nchar = len(alphabet[0])
# NOTE(review): the loop body is missing from this excerpt; presumably each
# entry is checked against `nchar` -- confirm against the full source.
for l in alphabet + neutral_alphabet:
def __init__(self, alphabet=DNA, neutral_alphabet='N', neutral_value=0.25, dtype=None):
    """Store the encoding configuration on the instance.

    # Arguments
        alphabet: Stored unchanged as `self.alphabet`
        neutral_alphabet: A single string is wrapped into a one-element
            list; any other value is stored as given
        neutral_value: Stored unchanged as `self.neutral_value`
        dtype: Stored unchanged as `self.dtype`
    """
    # Normalise a bare string so `self.neutral_alphabet` is not a plain str.
    normalized_neutral = (
        [neutral_alphabet] if isinstance(neutral_alphabet, str) else neutral_alphabet
    )

    self.alphabet = alphabet
    self.neutral_alphabet = normalized_neutral
    self.neutral_value = neutral_value
    self.dtype = dtype
def one_hot2string(arr, alphabet=DNA):
    """Decode a one-hot encoded array into strings.

    # Arguments
        arr: One-hot encoded array, passed to `one_hot2token`
        alphabet: Alphabet used to build the index -> letter lookup

    # Returns
        List with one string per row of tokens returned by `one_hot2token`
    """
    token_rows = one_hot2token(arr)
    letter_for_index = _get_index_dict(alphabet)

    decoded = []
    for token_row in token_rows:
        # Look up each token's letter and glue the row back together.
        decoded.append(''.join(letter_for_index[token] for token in token_row))
    return decoded
def __call__(self, seq):
    """One-hot encode `seq` using the configuration stored on the instance.

    When the instance holds the default DNA settings (`DNA` alphabet,
    `['N']` neutral alphabet, neutral value 0.25) the call is routed to
    `F.one_hot_dna`; any other configuration goes through `F.one_hot`.
    """
    uses_default_dna_setup = (
        self.alphabet == DNA
        and self.neutral_alphabet == ['N']
        and self.neutral_value == 0.25
    )
    if uses_default_dna_setup:
        return F.one_hot_dna(seq, self.dtype)

    # Non-default configuration: forward every setting explicitly.
    return F.one_hot(
        seq,
        alphabet=self.alphabet,
        neutral_alphabet=self.neutral_alphabet,
        neutral_value=self.neutral_value,
        dtype=self.dtype,
    )
def one_hot_dna(seq, dtype=None):
    """One-hot encode a DNA string.

    # Arguments
        seq: String to encode
        dtype: dtype of the result; passed to `astype` on the fast path and
            to `one_hot` on the fallback path

    # Returns
        The encoded array, with 4 columns on the fast path

    # Raises
        ValueError: if `seq` is not a `str`
    """
    if not isinstance(seq, str):
        raise ValueError("seq needs to be a string")

    if one_hot_encode_sequence is None:
        # Fast encoder unavailable: use the generic implementation with the
        # DNA alphabet and 'N' as the neutral letter.
        return one_hot(seq, alphabet=DNA, neutral_alphabet=['N'], neutral_value=.25, dtype=dtype)

    # genomelake's one_hot_encode_sequence could be imported
    # It fills a preallocated float32 buffer in place.
    buffer = np.zeros((len(seq), 4), dtype=np.float32)
    one_hot_encode_sequence(seq, buffer)
    return buffer.astype(dtype)