Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m_error = re.search(
r'<div class="block-error">\s*<div class="heading">\s*<div>(?P.+?)</div>\s*</div>', webpage)
if m_error:
raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
video_title = None
duration = None
video_thumbnail = None
formats = []
# most of the information is stored in the flashvars
flashvars = self._html_search_regex(
r'flashvars="(.+?)"', webpage, 'flashvars')
infos = compat_urllib_parse_unquote(flashvars).split(r'&')
for info in infos:
videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
if videovars_match:
key = videovars_match.group(1)
val = videovars_match.group(2)</div>
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
return res
if cc_json:
for closed_caption in cc_json:
lang = closed_caption['lang']
if lang not in subtitles:
subtitles[lang] = []
subtitles[lang].append({
'url': closed_caption['url'],
'ext': mimetype2ext(closed_caption['content_type']),
})
return {
'id': video_id,
'display_id': display_id,
'title': unescapeHTML(meta['title']),
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
'duration': int_or_none(meta.get('duration')),
'subtitles': subtitles,
}
post_id = self._match_id(url)
wall_url = 'https://vk.com/wall%s' % post_id
post_id = remove_start(post_id, '-')
webpage = self._download_webpage(wall_url, post_id)
error = self._html_search_regex(
r'>Error\s*]+class=["\']body["\'][^>]*>([^<]+)',
webpage, 'error', default=None)
if error:
raise ExtractorError('VK said: %s' % error, expected=True)
description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class('author', webpage))
thumbnail = self._og_search_thumbnail(webpage)
entries = []
audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
if audio_ids:
al_audio = self._download_webpage(
'https://vk.com/al_audio.php', post_id,
note='Downloading audio info', fatal=False,
data=urlencode_postdata({
'act': 'reload_audio',
'al': '1',
'ids': ','.join(audio_ids)
}))
if al_audio:
Audio = collections.namedtuple(
def _real_extract(self, url):
album_id = self._match_id(url)
webpage = self._download_webpage(
url, album_id, note='Download album info',
errnote='Unable to get album info')
album_name = self._html_search_regex(
r']+class="comm"[^<]+]+title="([^"]+)"', webpage,
'album name')
album_intro = remove_start(
clean_html(get_element_by_id('intro', webpage)),
'%s简介:' % album_name)
entries = [
self.url_result(song_url, 'Kuwo') for song_url in re.findall(
r']+class="listen">]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
webpage)
]
return self.playlist_result(entries, album_id, album_name, album_intro)
login_form.update({
'username': username,
'password': password,
})
post_url = urljoin(self._LOGIN_URL, self._search_regex(
r']+action=(["\'])(?P.+?)\1', login_page,
'post url', default=self._LOGIN_URL, group='url'))
response, urlh = self._download_webpage_handle(
post_url, None, 'Logging in', data=urlencode_postdata(login_form),
headers={'Referer': self._LOGIN_URL})
if self._LOGIN_URL in urlh.geturl():
error = clean_html(get_element_by_class('form-message', response))
if error:
raise ExtractorError(
'Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
video_title = self._html_search_regex(
r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage,
'title', default=None)
if not video_title:
video_title = self._html_search_regex(
r'(?s)<span class="fbPhotosPhotoCaption"><span class="hasCaption">(.*?)</span>',
webpage, 'alternative title', default=None)
if not video_title:
video_title = self._html_search_meta(
'description', webpage, 'title', default=None)
if video_title:
video_title = limit_length(video_title, 80)
else:
video_title = 'Facebook video #%s' % video_id
uploader = clean_html(get_element_by_id(
'fbPhotoPageAuthorName', webpage)) or self._search_regex(
r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
timestamp = int_or_none(self._search_regex(
r']+data-utime=["\'](\d+)', webpage,
'timestamp', default=None))
thumbnail = self._og_search_thumbnail(webpage)
info_dict = {
'id': video_id,
'title': video_title,
'formats': formats,
'uploader': uploader,
'timestamp': timestamp,
'thumbnail': thumbnail,
}
</span>
if subtitle:
title += ' - %s' % subtitle
title = title.strip()
subtitles = {}
subtitles_list = [{
'url': subformat['url'],
'ext': subformat.get('format'),
} for subformat in info.get('subtitles', []) if subformat.get('url')]
if subtitles_list:
subtitles['fr'] = subtitles_list
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': clean_html(info['synopsis']),
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
}