Сканирование Scrapy прекращается слишком рано и получение URL-адресов вне выражения регулярного выражения

Цель: Очистить сайт стенограммы мыльной оперы, чтобы получить корпус стенограмм. Стенограммы находятся на страницах в форме http://tvmegasite.net/transcripts/amc/main/2001transcripts.shtml и имеют регулярное выражение:

(https?:\/\/?tvmegasite\.net\/transcripts\/\w+\/main\/\d+\w+\.\w+)

1) Вопрос 1: из результата № 1 ясно, что я недостаточно сделал, чтобы перейти по ссылкам, чтобы перейти к этапу 4 (то есть, чтобы получить что-то похожее на http://tvmegasite.net/transcripts/theshow/main/,) который будет содержать нужные страницы стенограммы формы регулярного выражения выше. Паук прекращает переходить по ссылкам в какой-то момент вместо того, чтобы продолжать до конца. Так каков наилучший способ сохранить следующие ссылки, пока я не доберусь до нужной формы?

2) Вопрос 2. Когда я попытался проверить, насколько хорошо работает скребок, несмотря на это, например, начав очистку по адресу http://tvmegasite.net/amc/main, я не получаю список только HTML стенограммы, но множество других ссылок, многие из которых не удовлетворяют выражению регулярного выражения. Я знаю, что мое регулярное выражение верно, так что происходит?

Мой код:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from soapoperascrape.items import SoapoperascrapeItem

class SoapOperaSpider(CrawlSpider):
    name="S_O_"
    allowed_domains=["tvmegasite.net"]
    start_urls=["http://tvmegasite.net/transcripts"]
    rules=(Rule(LinkExtractor(allow=('https?://tvmegasite\.net/transcripts/\w+/main/\d+\w+\.\w+', )), callback='parse_dir_contents'),)

    def parse(self, response):
        hxs=Selector(response)
        for href in hxs.xpath('/html/body/pre/a/@href'):
            url=response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('/html/body/pre'):
            item=SoapoperascrapeItem()
            item['link']=sel.xpath('a/@href').extract()
            yield item

Результат № 1:

[{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "blueprnt/", "clearday/", "dpawin/", "expeditn/", "nature/", "strtedge/", "sumipntg/", "themes.inf", "tp-dis4/", "tp-mod2/", "tp-pnt9/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "bottom.htm", "top.htm"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=D", "/", "_borders/", "_fpclass/", "_private/", "_themes/", "amc/", "atwt/", "aw/", "bb/", "days/", "gh/", "gl/", "images/", "oltl/", "passions/", "pc/", "resources/", "test/", "yr/"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=A", "/", "yr/", "test/", "resources/", "pc/", "passions/", "oltl/", "images/", "gl/", "gh/", "days/", "bb/", "aw/", "atwt/", "amc/", "_themes/", "_private/", "_fpclass/", "_borders/"]},
{"link": ["?N=A", "?M=A", "?S=D", "?D=A", "/", "_borders/", "_fpclass/", "_private/", "_themes/", "amc/", "atwt/", "aw/", "bb/", "days/", "gh/", "gl/", "images/", "oltl/", "passions/", "pc/", "resources/", "test/", "yr/"]},
{"link": ["?N=A", "?M=D", "?S=A", "?D=A", "/", "_private/", "images/", "amc/", "atwt/", "aw/", "bb/", "days/", "gh/", "gl/", "oltl/", "passions/", "pc/", "yr/", "resources/", "_fpclass/", "_borders/", "_themes/", "test/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "menuh.css", "menuh_main.css", "new3.css"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "amc_menu.html", "atwt_menu.html", "aw_menu.html", "bb_menu.html", "city_menu.html", "classic_prime.html", "current_prime.html", "day_amc.html", "day_atwt.html", "day_aw.html", "day_bb.html", "day_bottom.html", "day_city.html", "day_days.html", "day_gh.html", "day_gl.html", "day_oltl.html", "day_passions.html", "day_pc.html", "day_sube.html", "day_yr.html", "days_menu.html", "gh_menu.html", "ghns_menu.html", "gl_menu.html", "home_gen.html", "menu.js", "menu_ie4.js", "menu_ie5.js", "menu_moz.js", "menu_ns4.js", "menu_op5.js", "menu_op6.js", "menu_style.txt", "pass_menu.html", "pc_menu.html", "plinks_menu.html", "sniffer.js", "style.js", "sube_menu.html", "yr_menu.html"]}]

Результат № 2. Когда я запускаю сканирование на более поздней стадии, например, с http://tvmegasite.net/transcripts/amc/main, неожиданно я получаю список не только с URL-адресами стенограммы, но и с другими нежелательными сообщениями. с этим:

[{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/amc/main/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/amc/main/", "adbannerinclude4.txt", "custom.js", "custom_orig.js", "exfile.htm", "exfileinclude.txt", "menu.txt", "webringsinclude.txt", "webringsinclude_copy(1).txt"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=D", "/transcripts/amc/", "1998transcripts.shtml", "1999transcripts.shtml", "2001transcripts.shtml", "2002transcripts.shtml", "2003transcripts.shtml", "2004transcripts.shtml", "2005transcripts.shtml", "2006transcripts.shtml", "2007transcripts.shtml", "2008transcripts.shtml", "2009transcripts.shtml", "2010transcripts.shtml", "2011transcripts.shtml", "2013transcripts.shtml", "_overlay/", "favicon.ico", "localresources/", "newtemplate.shtml"]},
{"link": ["?N=A", "?M=A", "?S=D", "?D=A", "/transcripts/amc/", "_overlay/", "localresources/", "favicon.ico", "newtemplate.shtml", "1999transcripts.shtml", "1998transcripts.shtml", "2013transcripts.shtml", "2001transcripts.shtml", "2011transcripts.shtml", "2007transcripts.shtml", "2008transcripts.shtml", "2009transcripts.shtml", "2006transcripts.shtml", "2005transcripts.shtml", "2004transcripts.shtml", "2003transcripts.shtml", "2010transcripts.shtml", "2002transcripts.shtml"]},
{"link": ["?N=A", "?M=D", "?S=A", "?D=A", "/transcripts/amc/", "_overlay/", "favicon.ico", "localresources/", "newtemplate.shtml", "1998transcripts.shtml", "2010transcripts.shtml", "1999transcripts.shtml", "2001transcripts.shtml", "2002transcripts.shtml", "2003transcripts.shtml", "2004transcripts.shtml", "2005transcripts.shtml", "2006transcripts.shtml", "2007transcripts.shtml", "2008transcripts.shtml", "2009transcripts.shtml", "2011transcripts.shtml", "2013transcripts.shtml"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=A", "/transcripts/amc/", "newtemplate.shtml", "localresources/", "favicon.ico", "_overlay/", "2013transcripts.shtml", "2011transcripts.shtml", "2010transcripts.shtml", "2009transcripts.shtml", "2008transcripts.shtml", "2007transcripts.shtml", "2006transcripts.shtml", "2005transcripts.shtml", "2004transcripts.shtml", "2003transcripts.shtml", "2002transcripts.shtml", "2001transcripts.shtml", "1999transcripts.shtml", "1998transcripts.shtml"]}]

0 ответов

Другие вопросы по тегам