Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
class LinksExtractor(BaseExtractor):
    """Collect the ``<a>`` elements of the article's raw document."""

    def extract(self, url):
        """Return one record per anchor found in ``self.article._raw_doc``.

        Args:
            url: Forwarded as the second argument to ``get_canonical_url``
                alongside each anchor's ``href`` attribute (presumably the
                page URL used as the resolution base -- confirm against
                the helper's definition).

        Returns:
            A list of dicts with the keys ``'href'``, ``'text'`` and
            ``'rel'``. ``'text'`` and ``'rel'`` fall back to ``''`` when
            the parser returns a falsy value.
        """
        parser = self.parser
        anchors = parser.getElementsByTag(self.article._raw_doc, 'a')
        # Every anchor is kept. The previous per-item ``if attr:`` guard
        # tested a three-key dict literal, which is always truthy, so it
        # never filtered anything and has been dropped.
        return [
            {
                'href': get_canonical_url(parser.getAttribute(anchor, 'href'), url),
                'text': parser.getText(anchor) or '',
                'rel': parser.getAttribute(anchor, 'rel') or '',
            }
            for anchor in anchors
        ]
class RobotsExtractor(BaseExtractor):
    """Read the directives of the document's ``<meta name="robots">`` tags."""

    def extract(self):
        """Return the robots directives as a flat list of strings.

        The ``content`` attribute of every matching meta element is split
        on commas; each piece is stripped and lower-cased. Elements whose
        ``content`` is missing or empty contribute nothing.
        """
        selector = {'tag': 'meta', 'attr': 'name', 'value': 'robots'}
        metas = self.parser.getElementsByTag(self.article._raw_doc, **selector)
        directives = []
        for meta in metas:
            content = self.parser.getAttribute(meta, 'content')
            if not content:
                # No usable content attribute on this element.
                continue
            directives += [part.strip().lower() for part in content.split(',')]
        return directives
# NOTE(review): only the first statement of this function is visible in this
# excerpt; the remainder of the body appears to be cut off. Confirm against
# the complete file before relying on its behaviour or return value.
def crawl_url(url):
# Build a Goose instance configured with the user agent from ``cfg`` and the
# 'soup' parser class.
g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class':'soup'})
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from goose3 import Goose
from goose3.extractors import BaseExtractor
from lib.utils import *
import config as cfg
class LinksExtractor(BaseExtractor):
    """Collect the ``<a>`` elements of the article's raw document."""

    def extract(self, url):
        """Return one record per anchor found in ``self.article._raw_doc``.

        Args:
            url: Forwarded as the second argument to ``get_canonical_url``
                alongside each anchor's ``href`` attribute (presumably the
                page URL used as the resolution base -- confirm against
                the helper's definition).

        Returns:
            A list of dicts with the keys ``'href'``, ``'text'`` and
            ``'rel'``. ``'text'`` and ``'rel'`` fall back to ``''`` when
            the parser returns a falsy value.
        """
        parser = self.parser
        anchors = parser.getElementsByTag(self.article._raw_doc, 'a')
        # Every anchor is kept. The previous per-item ``if attr:`` guard
        # tested a three-key dict literal, which is always truthy, so it
        # never filtered anything and has been dropped.
        return [
            {
                'href': get_canonical_url(parser.getAttribute(anchor, 'href'), url),
                'text': parser.getText(anchor) or '',
                'rel': parser.getAttribute(anchor, 'rel') or '',
            }
            for anchor in anchors
        ]
class RobotsExtractor(BaseExtractor):
def extract(self):