Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_regex_filth(self):
    """RegexDetector must refuse a ``filth_cls`` that is not RegexFilth.

    A detector is wired up with a plain ``Filth`` subclass and its
    ``iter_filth`` output is consumed; the test passes only if that
    raises ``UnexpectedFilth``.
    """

    class MyFilth(Filth):
        pass

    class MyDetector(RegexDetector):
        filth_cls = MyFilth

    dirty_text = 'dirty dirty text'
    bad_detector = MyDetector()
    with self.assertRaises(UnexpectedFilth):
        # Exhaust the iterable so an error raised during iteration
        # (rather than at call time) is still observed.
        list(bad_detector.iter_filth(dirty_text))
import re
import textblob
from .base import RegexDetector
from ..filth import NameFilth
from ..utils import CanonicalStringSet
class NameDetector(RegexDetector):
"""Detect proper nouns (names) in dirty dirty ``text``.

Use part of speech tagging to clean proper nouns out of the dirty dirty
``text``. Disallow particular nouns by adding them to the
``NameDetector.disallowed_nouns`` set.
"""
# Filth class associated with this detector.
filth_cls = NameFilth
# Class-level set of disallowed nouns, seeded with "skype". It must stay a
# CanonicalStringSet: ``iter_filth`` rejects any other type.
disallowed_nouns = CanonicalStringSet(["skype"])
def iter_filth(self, text):
# Validate the configuration first: if ``disallowed_nouns`` has been
# replaced with anything other than a CanonicalStringSet (e.g. a plain
# list or set), raise TypeError before doing any other work.
if not isinstance(self.disallowed_nouns, CanonicalStringSet):
raise TypeError(
'NameDetector.disallowed_nouns must be CanonicalStringSet'
)
# find the set of proper nouns using textblob.
# NOTE(review): the remainder of this method is not visible in this
# excerpt -- confirm the detection logic against the full source.
import re
from .base import RegexDetector
from ..filth import EmailFilth
class EmailDetector(RegexDetector):
"""Detect email addresses in dirty dirty ``text``.

Use regular expression magic to remove email addresses from dirty
dirty ``text``. This detector also catches email addresses like ``john at
gmail.com``.
"""
# Filth class associated with this detector.
# NOTE(review): the regular expression itself is not visible in this
# excerpt -- confirm the ``john at gmail.com`` claim against the pattern.
filth_cls = EmailFilth
import re
import nltk
import textblob
from .base import RegexDetector
from ..filth import SkypeFilth
class SkypeDetector(RegexDetector):
"""Detect Skype usernames in dirty dirty text.

Skype usernames tend to be used inline in dirty dirty text quite
often but also appear as ``skype: {{SKYPE}}`` quite a bit. This detector
looks at words within ``word_radius`` words of "skype" for things that
appear to be misspelled or have punctuation in them as a means to
identify skype usernames.

Default ``word_radius`` is 10, corresponding with the rough scale of
half of a sentence before or after the word "skype" is used. Increasing
the ``word_radius`` will increase the false positive rate and
decreasing the ``word_radius`` will increase the false negative rate.
"""
# Filth class associated with this detector.
filth_cls = SkypeFilth
# Number of words on either side of "skype" that are inspected (see the
# class docstring for the precision/recall trade-off).
word_radius = 10
def iter_filth(self, text):
# NOTE(review): the body of this method is not visible in this excerpt;
# the behaviour described in the class docstring cannot be confirmed here.
from .base import RegexDetector
from ..filth import UrlFilth
class UrlDetector(RegexDetector):
"""Detect URLs in dirty dirty ``text``.

Use regular expressions to remove URLs that begin with ``http://``,
``https://`` or ``www.`` from dirty dirty ``text``.

With ``keep_domain=True``, this detector only obfuscates the path on a
URL, not its domain. For example,
``http://twitter.com/someone/status/234978haoin`` becomes
``http://twitter.com/{{replacement}}``.
"""
# Filth class associated with this detector.
# NOTE(review): neither the URL pattern nor ``keep_domain`` is defined in
# the lines visible here -- presumably they live on ``UrlFilth`` or further
# down this class; confirm against the full source.
filth_cls = UrlFilth