Scrapy: как сканировать ссылки AngularJS?

Я использую selenium-webdriver для рендеринга JavaScript в пауке Scrapy, но, похоже, ссылки AngularJS с атрибутом 'ng-href' не сканируются. Сканирует ли скрипт ссылки 'ng-href'? Если нет, как заставить его сканировать такие ссылки?

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender

from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse

class HomeSpider(CrawlSpider):
    """Crawl spider that reports pages which look broken or empty.

    Links are extracted from the start pages (one level only, ``follow=False``)
    and each linked page is re-loaded in a PhantomJS browser so that the
    checks in ``parse_items`` run against the JavaScript-rendered markup.
    """

    name = "capseleniums"
    allowed_domains = ["www.ecommerce.com", "learn.ecommerce.com", "health.ecommerce.com", "wm15.ecommerce.com", "wm13.ecommerce.com", "wm12.ecommerce.com" ]
    # Let these error responses reach the callback instead of being dropped,
    # so that parse_items can see (and report) e.g. a 404 status.
    handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]

    def start_requests(self):
        """Return the initial requests, issued in reverse listing order."""
        start_urls = reversed([
            'http://wm12.ecommerce.com/health-wellness-center/',
            'http://wm13.ecommerce.com/Cook/',
            'http://wm15.ecommerce.com/electronics-resource-center/',
            'http://health.ecommerce.com/vitamins-wellness-center/',
            'http://learn.ecommerce.com/Tips-Ideas/',
        ])
        return [Request(url=start_url) for start_url in start_urls]

    # Deliberately a plain function without ``self``: it is only used below,
    # inside the class body, as the link extractor's ``process_value`` hook.
    def trim(link_text):
        """Strip surrounding spaces, tabs and line breaks from a link value."""
        return link_text.strip(' \t\n\r')

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                # The extractor only looks at ``href`` unless told otherwise;
                # AngularJS templates put the link into ``ng-href``.
                attrs=('href', 'ng-href'),
                process_value=trim,
            ),
            callback="parse_items",
            follow=False,
        ),
    )

    def __init__(self, category=None, *args, **kwargs):
        """Start the PhantomJS browser and initialise the crawl spider.

        ``category`` is accepted for interface compatibility but is not used.
        """
        self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])
        super(HomeSpider, self).__init__(*args, **kwargs)

    def __del__(self):
        """Last-resort cleanup of the browser process."""
        self._quit_driver()

    def closed(self, reason):
        """Shut the browser down when Scrapy closes the spider."""
        self._quit_driver()

    def _quit_driver(self):
        """Quit the browser once; safe to call repeatedly or before __init__ ran."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Clear the reference first so a second call becomes a no-op.
            self.driver = None
            # WebDriver exposes quit(), not stop().
            driver.quit()

    def _make_item(self, response):
        """Build a Website item describing ``response``."""
        item = Website()
        # Report the URL that was originally requested when redirects occurred.
        item['url'] = response.meta.get('redirect_urls', [response.url])[0]
        item['referer'] = response.request.headers.get('Referer')
        item['status'] = response.status
        return item

    def _is_error_page(self, response, rendered):
        """Return True when the response or its rendered markup looks broken.

        ``rendered`` is a Selector over the browser-rendered page source.
        """
        if response.status == 404:
            return True
        return bool(
            rendered.xpath('/html/head/title/text()[contains(.,"invalid")]')
            or rendered.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]')
            or rendered.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]')
        )

    def parse_items(self, response):
        """Render the page in the browser and report it if it looks broken.

        Returns a list of Website items (one per ``<html>`` element of the
        rendered page) when the response is a 404, its title contains
        "invalid", its canonical link contains "invalid-category-id", or its
        result summary says "Showing 0 of"; otherwise an empty list.
        """
        browser = self.driver
        browser.get(response.url)
        # Fixed pause before reading the page source, presumably to give the
        # page's scripts time to run. NOTE(review): this is a blocking sleep
        # inside a Scrapy callback - confirm that is acceptable.
        time.sleep(1)
        rendered_response = TextResponse(
            url=response.url, body=browser.page_source, encoding='utf-8')
        rendered = Selector(rendered_response)

        if not self._is_error_page(response, rendered):
            return []
        return [self._make_item(response) for _ in rendered.xpath('//html')]

1 ответ

Решение

По умолчанию LinkExtractor ищет ссылки только в атрибуте href тегов a и area.

Нужно лишь дополнительно задать аргумент attrs и включить в него атрибут ng-href:

# callback and follow are Rule arguments; only attrs belongs to LinkExtractor.
Rule(LinkExtractor(attrs=['href', 'ng-href']), callback="parse_items", follow=False),
Другие вопросы по тегам