How to use urlscan - 10 common examples

To help you get started, we’ve selected a few urlscan examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github firecat53 / urlscan / urlscan / urlscan.py View on Github external
if data:
            if data[0].isspace():
                self.trailing_space = True
            if data[-1].isspace():
                future_trailing_space = True
        data = ' '.join(data.split())
        if self.anchor_stack[-1] is None:
            style = 'msgtext'
        else:
            style = 'anchor'
        if self.style_stack[-1]:
            stylelist = list(self.style_stack[-1])
            stylelist.sort()
            style = style + ':' + ''.join(stylelist)

        self.add_chunk(Chunk((style, data), self.cur_url()))
        self.trailing_space = future_trailing_space
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def end_para(self):
        """Close the current paragraph and prepare for the next one.

        If a paragraph has already been ended (nothing added since), an
        empty paragraph is appended to ``self.rval``; otherwise the
        parser is merely flagged as sitting at a paragraph start.  Any
        pending trailing space is discarded.  When inside one or more
        lists, an indentation chunk of three spaces per nesting level
        is emitted to start the new paragraph.
        """
        if not self.at_para_start:
            self.at_para_start = True
        else:
            # Two paragraph breaks in a row: record an empty paragraph.
            self.rval.append([])
        self.trailing_space = False

        if not self.list_stack:
            return
        # Indent continuation paragraphs to line up under the list item.
        indent = ' ' * 3 * len(self.list_stack)
        self.add_chunk(Chunk(indent, self.cur_url()))
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def end_list_para(self):
        """Start a new list item (or a plain paragraph outside a list).

        An empty paragraph is appended first when the parser is already
        at a paragraph start.  Outside any list this simply defers to
        ``end_para``.  Inside a list, a marker chunk is emitted: for
        ``ul`` a bullet picked from ``HTMLChunker.ul_tags`` by nesting
        depth, otherwise the current item number, which is then
        incremented on the list stack.
        """
        if self.at_para_start:
            self.rval.append([])

        if not self.list_stack:
            self.end_para()
            return

        innermost = self.list_stack[-1]
        tag = innermost[0]
        if tag == 'ul':
            # Bullet style cycles with how many 'ul' lists are open.
            depth = sum(1 for entry in self.list_stack if entry[0] == tag)
            bullets = HTMLChunker.ul_tags
            marker = '%s  ' % (bullets[depth % len(bullets)])
        else:
            number = innermost[1]
            # Advance the counter for the next item of this list.
            self.list_stack[-1] = (tag, number + 1)
            marker = "%2d." % number
        self.add_chunk(Chunk(marker, self.cur_url()))
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
loc = 0

    for match in URLRE.finditer(mesg):
        if loc < match.start():
            rval.append(Chunk(mesg[loc:match.start()], None))
        # Turn email addresses into mailto: links
        email = match.group("email")
        if email and "mailto" not in email:
            mailto = "mailto:{}".format(email)
        else:
            mailto = match.group(1)
        rval.append(Chunk(None, mailto))
        loc = match.end()

    if loc < len(mesg):
        rval.append(Chunk(mesg[loc:], None))

    return rval
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def add_chunk(self, chunk):
        """Append ``chunk`` to the paragraph currently being built.

        A fresh paragraph is opened in ``self.rval`` when the parser is
        at a paragraph start.  Otherwise, if whitespace was pending from
        the previous data, a single-space chunk (carrying the current
        URL) is inserted before ``chunk``.  Both state flags are cleared
        afterwards.
        """
        paragraphs = self.rval
        if self.at_para_start:
            paragraphs.append([])
        else:
            if self.trailing_space:
                # Collapse the pending whitespace into one separator.
                paragraphs[-1].append(Chunk(' ', self.cur_url()))

        paragraphs[-1].append(chunk)
        self.at_para_start = self.trailing_space = False
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
components."""

    rval = []

    loc = 0

    for match in URLRE.finditer(mesg):
        if loc < match.start():
            rval.append(Chunk(mesg[loc:match.start()], None))
        # Turn email addresses into mailto: links
        email = match.group("email")
        if email and "mailto" not in email:
            mailto = "mailto:{}".format(email)
        else:
            mailto = match.group(1)
        rval.append(Chunk(None, mailto))
        loc = match.end()

    if loc < len(mesg):
        rval.append(Chunk(mesg[loc:], None))

    return rval
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def extracthtmlurls(mesg):
    """Extract URLs, with surrounding context, from an HTML message.

    The message is parsed with ``HTMLChunker`` and the resulting
    paragraphs are handed to ``extract_with_context``, keeping those
    in which at least one chunk carries a URL.

    """
    def has_url(chunks):
        # A group is interesting when any of its chunks links somewhere.
        return any(item.url is not None for item in chunks)

    parser = HTMLChunker()
    parser.feed(mesg)
    parser.close()

    # NOTE(review): the two 1s are presumably the amount of context kept
    # above and below each match -- confirm against extract_with_context.
    return extract_with_context(parser.rval, has_url, 1, 1)
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def handle_entityref(self, name):
        """Translate a named entity reference into text data.

        Names present in ``HTMLChunker.entities`` are replaced by their
        mapped value; anything else is passed on literally as
        ``&name;``.
        """
        known = HTMLChunker.entities
        if name in known:
            text = known[name]
        else:
            # An entity showing up verbatim in the output means it is
            # missing from HTMLChunker.entities and should be added there.
            text = '&%s;' % name
        self.handle_data(text)
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def end_list_para(self):
        """Start a new list item (or a plain paragraph outside a list).

        An empty paragraph is appended first when the parser is already
        at a paragraph start.  Outside any list this simply defers to
        ``end_para``.  Inside a list, a marker chunk is emitted: for
        ``ul`` a bullet picked from ``HTMLChunker.ul_tags`` by nesting
        depth, otherwise the current item number, which is then
        incremented on the list stack.
        """
        if self.at_para_start:
            self.rval.append([])

        if not self.list_stack:
            self.end_para()
            return

        innermost = self.list_stack[-1]
        tag = innermost[0]
        if tag == 'ul':
            # Bullet style cycles with how many 'ul' lists are open.
            depth = sum(1 for entry in self.list_stack if entry[0] == tag)
            bullets = HTMLChunker.ul_tags
            marker = '%s  ' % (bullets[depth % len(bullets)])
        else:
            number = innermost[1]
            # Advance the counter for the next item of this list.
            self.list_stack[-1] = (tag, number + 1)
            marker = "%2d." % number
        self.add_chunk(Chunk(marker, self.cur_url()))
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.anchor_stack.append(self.findattr(attrs, 'href'))
        elif tag in ('ul', 'ol'):
            self.list_stack.append((tag, 1))
            self.end_para()
        elif tag in HTMLChunker.tag_styles:
            self.style_stack.append(self.style_stack[-1] |
                                    set([HTMLChunker.tag_styles[tag]]))
        elif isheadertag(tag):
            self.style_stack.append(self.style_stack[-1] | set(['bold']))
        elif tag in ('p', 'br'):
            self.end_para()
        elif tag == 'img':
            # Since we expect HTML *email*, image links
            # should be external (naja?)
            alt = self.findattr(attrs, 'alt')
            if alt is None:
                alt = '[IMG]'
            src = self.findattr(attrs, 'src')
            if src is not None and not src.startswith(('http://', 'https://')):
                src = None