from typing import Optional

import bs4


def get_error_data(html: str) -> Optional[str]:
    """Get the error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Extract and join the error strings; return None when nothing was found.
    return " ".join(list(soup.stripped_strings)[1:3]) or None
import itertools
from collections import defaultdict
from urllib.parse import urljoin

from bs4 import SoupStrainer


def get_submissions(self, specified_langs):
    """Walk the paginated submission list, keeping one accepted URL per title and language."""
    submurl = 'https://leetcode.com/submissions/'
    strainer = SoupStrainer('tbody')
    memory = defaultdict(dict)
    for i in itertools.count(1):
        url = urljoin(submurl, str(i))
        soup = self.get_soup(url, strainer)
        rowlist = soup.find_all('tr')
        if not rowlist:
            break
        eachpage = defaultdict(dict)
        for row in rowlist:
            _, title, status, _, lang = list(row.stripped_strings)
            if status == 'Accepted':
                title = title.replace(' ', '_')
                # `memory` deduplicates across pages; `eachpage` collects what
                # is new on this page.
                if not memory[title].get(lang):
                    memory[title][lang] = urljoin(self.BASEURL, row.find_all('a')[1]['href'])
                    eachpage[title][lang] = memory[title][lang]
    info = []  # (the rest of this function is truncated in the source)
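The same SoupStrainer('tbody') trick can be exercised offline; the table below is invented, shaped like one row of the submissions list:

from bs4 import BeautifulSoup, SoupStrainer

sample = ('<table><tbody><tr><td>1</td><td>Two Sum</td>'
          '<td>Accepted</td><td>52 ms</td><td>python3</td></tr></tbody></table>')
rows = BeautifulSoup(sample, 'html.parser', parse_only=SoupStrainer('tbody')).find_all('tr')
print([list(r.stripped_strings) for r in rows])
# [['1', 'Two Sum', 'Accepted', '52 ms', 'python3']]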
from bs4 import BeautifulSoup, SoupStrainer

# `wadresults` (target -> collected fingerprints) and `headers` (response
# headers to record) are module-level globals in the original tool.

def httpurlstates(y, req):
    try:
        wadresults.setdefault(y, [])
        # Parse only the <meta> tags and record any generator fingerprint.
        parse = BeautifulSoup(req.content, 'html.parser', parse_only=SoupStrainer('meta'))
        for link in parse:
            if link.has_attr('name') and 'generator' in link['name']:
                wadresults[y].append(link['content'])
        for x in headers:
            # Record the header value, or an empty placeholder when absent.
            wadresults[y].append(req.headers.get(x, ''))
    except Exception:
        # The matching except clause is truncated in the source; swallow
        # parse errors so one bad response does not stop the scan.
        pass
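A self-contained run, with the two globals defined and a stand-in response object (all values invented for illustration):

wadresults = {}
headers = ['Server', 'X-Powered-By']

class FakeResponse:  # stand-in for a requests.Response
    content = b'<meta name="generator" content="WordPress 6.4">'
    headers = {'Server': 'nginx'}

httpurlstates('https://example.com', FakeResponse())
print(wadresults)  # {'https://example.com': ['WordPress 6.4', 'nginx', '']}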
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup, SoupStrainer

# `url` points at a php.net manual page and is defined earlier in the original script.
siteData = None
try:
    # print('Start to read')  # DEBUG
    siteData = urlopen(url)
    # print('Done reading.')  # DEBUG
except HTTPError as e:
    print(e.code)
except URLError as e:
    print(e.args)
# This is the default value that will be returned if nothing is found.
result = 'Found nothing.'
# Actually parse and find the text
if siteData is not None:
    # Use SoupStrainer to only parse what I need
    tagsWithClass = SoupStrainer('p', {'class': 'refpurpose'})
    # Create the soup object, using the SoupStrainer.
    # This is what takes the most time (hence the .txt-file cache)
    soup = BeautifulSoup(siteData, "lxml", parse_only=tagsWithClass)
    # Get the specific tag I need
    shortDescrPtag = soup.find("p", {"class": "refpurpose"})
    try:
        # Put the text without html tags in my fancy string
        result = 'PHP-manualen: ' + shortDescrPtag.get_text() + ' - ' + url
    except AttributeError:
        # shortDescrPtag is None when no refpurpose paragraph exists; keep the
        # default result. (The original except clause is truncated away.)
        pass
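The extraction step can be checked offline on invented markup (html.parser is used here just to avoid the lxml dependency):

from bs4 import BeautifulSoup, SoupStrainer

page = '<html><body><p class="refpurpose">strlen - Get string length</p></body></html>'
strained = BeautifulSoup(page, 'html.parser',
                         parse_only=SoupStrainer('p', {'class': 'refpurpose'}))
print(strained.find('p', {'class': 'refpurpose'}).get_text())
# strlen - Get string length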
import json

import requests
from bs4 import BeautifulSoup, SoupStrainer


def get_standard():
    standard_url = 'https://www.standardmedia.co.ke/business/category/19/business-news'
    # check_connection is a helper defined elsewhere in the original module.
    if check_connection(standard_url):
        standard = requests.get(standard_url)
        soup = BeautifulSoup(standard.text, 'lxml', parse_only=SoupStrainer('div'))
        standard = []
        for link in soup.select('h4 a', limit=14):
            if link.get_text():
                news_title = '{}({})'.format(link.get_text().strip(), link.get('href'))
                standard_link = requests.get(link.get('href'))
                soup_link = BeautifulSoup(standard_link.text, 'lxml',
                                          parse_only=SoupStrainer(['script']))
                article_date = 0
                content = ''
                image = ''
                try:
                    data = json.loads(soup_link.find('script', type='application/ld+json')
                                      .text.replace("\\", r"\\"))
                    article_date = data['dateModified']
                    content = data['description']
                    image = data['image']['url']
                    if image == 'https://www.standardmedia.co.ke':
                        image = ''
                    print(image)
                except (ValueError, AttributeError):
                    # AttributeError covers a page with no ld+json script at all.
                    print('Standard: invalid json detected')
                    continue
                news_dict = {
                    # The dict body is truncated in the source; these keys are
                    # inferred from the fields built above.
                    'title': news_title,
                    'date': article_date,
                    'content': content,
                    'image': image,
                }
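The JSON-LD step in isolation, on an invented script tag:

import json
from bs4 import BeautifulSoup, SoupStrainer

page = ('<script type="application/ld+json">'
        '{"dateModified": "2020-01-01", "description": "Example story",'
        ' "image": {"url": "https://example.com/a.jpg"}}</script>')
script = BeautifulSoup(page, 'html.parser',
                       parse_only=SoupStrainer(['script'])).find('script', type='application/ld+json')
data = json.loads(script.string)
print(data['dateModified'], data['image']['url'])
# 2020-01-01 https://example.com/a.jpg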
# Excerpt from a crawler's __init__; `parser` (an argparse.ArgumentParser) and
# `markup` are defined earlier in the original source, and the opening of the
# first add_argument call below is truncated away.
    action='store_true',
)
parser.add_argument(
    '--filename',
    help="filename to be saved.",
    default="gallery.txt"
)
self.options, _ = parser.parse_known_args()
self._session = requests.Session()
self._markup = markup
self._view_url = 'http://gall.dcinside.com/board/view'
self._comment_view_url = 'http://gall.dcinside.com/board/view'
self._current_post_id = self.options.init_post_id
# Parse only the parts of the page the crawler actually uses.
self._strainer = SoupStrainer('div', attrs={'class': [
    're_gall_top_1',     # title, author, timestamp
    'btn_recommend',     # upvote / downvote counts
    'gallery_re_title',  # comments
    's_write',           # post body
]})
# A custom header is required for the request to succeed.
self.header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
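A SoupStrainer given a list of class names keeps any div whose class appears in the list; a quick offline check on invented markup:

from bs4 import BeautifulSoup, SoupStrainer

strainer = SoupStrainer('div', attrs={'class': ['s_write', 'btn_recommend']})
html = ('<div class="s_write">post body</div>'
        '<div class="ad_banner">skipped</div>'
        '<div class="btn_recommend">42</div>')
soup = BeautifulSoup(html, 'html.parser', parse_only=strainer)
print([d.get_text() for d in soup.find_all('div')])  # ['post body', '42']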
import requests
from bs4 import BeautifulSoup, SoupStrainer


def _get_team_links(base_url, table_id):
    """Fetch the page and parse only the table with the given id."""
    links = SoupStrainer('table', {'id': table_id})
    return BeautifulSoup(requests.get(base_url).content, 'html.parser', parse_only=links)
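An offline check of the same pattern: only the table whose id matches gets parsed at all (markup invented):

from bs4 import BeautifulSoup, SoupStrainer

html = ('<table id="teams"><tr><td><a href="/t/1">Alpha</a></td></tr></table>'
        '<table id="other"><tr><td><a href="/t/2">Beta</a></td></tr></table>')
soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('table', {'id': 'teams'}))
print([a['href'] for a in soup.find_all('a')])  # ['/t/1']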