How to use the bs4.element.Tag class in bs4

To help you get started, we’ve selected a few bs4 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github flccrakers / dj-tango / bin / crawl-tango-DB / crawl-tango-database-2.py View on Github external
utils.remove_accents(tds[2].a.next).lower(),
				tds[3].a.next,
				utils.remove_accents(tds[3].a.next).lower(),
				tds[4].a.next,
				tds[5].next+tds[5].span.next,
				tds[6].a.next]

				if isinstance(tds[4].a.next, bs4.element.Tag):
					song[7] =''
				if not isinstance(tds[7].a.next, bs4.element.Tag):
					song.append(tds[7].a.next)
				else:
 					song.append('?')
				
				for i, curVal in enumerate(song):
					if isinstance(curVal, bs4.element.Tag):
						song[i] = ''
				#print(song)

				songs.append(song)

			curPage+=1
			url = orchestra = re.sub(r'P=\d+','P='+str(curPage), url)
			print(url)
			fp = urllib.request.urlopen(url)
			soup = BeautifulSoup(fp.read(), "lxml")


	else:
		print("Pas d'enregistrement pour "+url)
	#exit(0);
	return songs;
github Jenyay / outwiker / plugins / webpage / webpage / libs / bs4 / builder / _html5lib.py View on Github external
def appendChild(self, node):
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
github alibaba / AliOS-Things-Linux-Edition / bitbake / lib / bs4 / builder / _html5lib.py View on Github external
def appendChild(self, node):
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, str) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
github elgehelge / stocknews / stocknews.py View on Github external
if type(content) is bs4.element.NavigableString:
                    current_node['text'] += "\n" + content
                # Grab content from inline elements
                elif (
                        type(content) is bs4.element.Tag and
                        content.name in text_elements
                      ):
                    current_node['text'] += "\n" + content.text
                # Ignore script tags
                elif (
                        type(content) is bs4.element.Tag and
                        content.name in ignore_elements
                     ):
                    pass
                # Continue traversing the html tree
                elif type(content) is bs4.element.Tag:
                    child_nodes.append(_most_words(content))
            current_node['number_of_words'] = len(re.findall(word_regex,
                                                  current_node['text']))
            current_node['path'] = [] #empty path

            # Select the node containing most words
            # and add the current location to the path
            node_with_most_words = max(child_nodes + [current_node],
                                       key=lambda x:x['number_of_words'])

            node_with_most_words['path'].insert(0, (soup_xml_tag.name, str(soup_xml_tag.attrs)))

            return node_with_most_words
github FindHao / ciba / a_cat.py View on Github external
if len(temp_results[0].find_all('div')) and temp_results[0].div.get('style'):
                self.word.props[''] = temp_results[0].div.text
                return True

        # with open("test", 'w') as fout:
        #     fout.write(base.prettify())
        # todo: 发音的页面结构改变了
        temp_results = base.find_all("div", class_="base-speak")
        if temp_results:
            temp = temp_results[0]
            for node in temp:
                temp1 = ''
                temp2 = ''
                if isinstance(node, bs4.element.Tag):
                    for node1 in node:
                        if not isinstance(node1, bs4.element.Tag):
                            continue
                        if node1.name == 'span':
                            temp1 = node1.text
                        elif node1.name == 'i':
                            temp3 = voice_url_reg.findall(node1['ms-on-mouseover'])
                            if temp3:
                                temp2 = temp3[0]

                    self.word.voices.append((temp1, temp2))
        # 获取基本词义
        print(self.word.voices)
        temp_results = base.find_all('ul', class_='Mean_part__1RA2V')
        # print(temp_results)
        if temp_results:
            print(temp_results)
            meaning_text = ''
github stopstalk / stopstalk-deployment / modules / profilesites.py View on Github external
soup = bs4.BeautifulSoup(t.text)
            table_body = soup.find("tbody")

            # Check if the page retrieved has no submissions
            if len(table_body) == 1:
                return submissions

            row = 0
            submissions[handle][page] = {}
            for i in table_body:
                submissions[handle][page][it] = []
                submission = submissions[handle][page][it]
                append = submission.append

                if isinstance(i, bs4.element.Tag):
                    if row == 0:
                        currid = i.contents[1].contents[0]
                        if currid == previd:
                            flag = 1
                            break
                    row += 1
                    previd = currid

                    # Time of submission
                    tos = i.contents[3].contents[1].contents[0]
                    curr = time.strptime(str(tos), "%Y-%m-%d %H:%M:%S")
                    if curr <= last_retrieved:
                        return submissions
                    append(str(tos))

                    # Problem Name/URL