Проблемы экспорта Scrapy JSON
Я следовал нескольким онлайн-урокам, чтобы с помощью Scrapy собирать (скрейпить) адреса электронной почты с Craigslist. У меня есть этот код, но когда я запускаю команду с экспортом в файл JSON, файл создаётся, однако в нём оказывается только один символ '['.
Любая помощь будет принята с благодарностью. Ниже мой код:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy_demo.items import ScrapyDemoItem
import urlparse
from scrapy.http.request import Request
class ScrapyDemoSpider(BaseSpider):
    """Spider that walks a Craigslist search page and scrapes each listing.

    NOTE: the XPath strings below (``'//....'`` and similar) are elided
    placeholders exactly as posted, not valid expressions. They must be
    replaced with real selectors before the spider can yield any items.
    """

    name = "scrapy_demo"
    allowed_domains = ["buffalo.craigslist.org"]
    start_urls = ['http://buffalo.craigslist.org/search/cps/']

    def parse(self, response):
        """Yield a request per listing link, then a request for the next page.

        Each listing request carries a partially filled item in ``meta`` so
        that ``parse_listing_page`` can complete it.
        """
        hxs = HtmlXPathSelector(response)
        listings = hxs.select('//....')
        links = []

        #scrap listings page to get listing links
        for listing in listings:
            link = listing.select('..../@href').extract()[0]
            links.append(link)

        #parse listing url to get content of the listing page
        for link in links:
            item = ScrapyDemoItem()
            item['link'] = link
            yield Request(urlparse.urljoin(response.url, link), meta={'item': item}, callback=self.parse_listing_page)

        #get next button link
        # Keep the whole list and test it before indexing: indexing [0]
        # directly would raise IndexError on a page without a "next" link,
        # before the emptiness check could ever run.
        next_page = hxs.select("//..../@href").extract()
        if next_page:
            yield Request(urlparse.urljoin(response.url, next_page[0]), self.parse)

    #scrap listing page to get content
    def parse_listing_page(self, response):
        """Fill in title and content on the item passed via request meta."""
        hxs = HtmlXPathSelector(response)
        item = response.request.meta['item']
        item['title'] = hxs.select('//..../text()').extract()[0]
        item['content'] = hxs.select('//..../text()').extract()[0]
        yield item
1 ответ
Решение
Здесь несколько проблем.
Основная проблема заключается в недопустимых выражениях внутри вызовов select().
Помимо этого:
- используйте response.xpath() или response.css() — HtmlXPathSelector больше не нужен;
- нет необходимости создавать экземпляр Item в обратном вызове parse() и передавать его через meta: URL можно получить из response.url в обратном вызове parse_listing_page().
Улучшенный рабочий код:
import urlparse
from scrapy.spider import BaseSpider
from scrapy.http.request import Request
from scrapy_demo.items import ScrapyDemoItem
class ScrapyDemoSpider(BaseSpider):
    """Spider that follows Craigslist search results and scrapes each listing.

    ``parse`` handles the paginated search pages; ``parse_listing_page``
    builds one item per individual listing page.
    """

    name = "scrapy_demo"
    allowed_domains = ["buffalo.craigslist.org"]
    start_urls = ['http://buffalo.craigslist.org/search/cps/']

    def parse(self, response):
        """Yield a request for every listing link and for the next page."""
        # processing listings
        for listing in response.css('p.row > a[data-id]'):
            link = listing.xpath('@href').extract()[0]
            yield Request(urlparse.urljoin(response.url, link), callback=self.parse_listing_page)

        # following next page
        # extract() gives a list, so an absent "next" link simply skips
        # the follow-up request instead of raising.
        next_page = response.xpath('//a[contains(@class, "next")]/@href').extract()
        if next_page:
            yield Request(urlparse.urljoin(response.url, next_page[0]), callback=self.parse)

    def parse_listing_page(self, response):
        """Build an item from a listing page: its URL, title and body text."""
        item = ScrapyDemoItem()
        item['link'] = response.url
        item['title'] = response.xpath('//title/text()').extract()[0].strip()
        # Only the first text node of the posting body is taken.
        item['content'] = response.xpath('//section[@id="postingbody"]/text()').extract()[0].strip()
        yield item
Если вы запустите spider, то в выходном файле JSON вы увидите:
[
{"content": "Using a web cam with your computer to video communicate with your loved ones has never been made easier and it's free (providing you have an Internet connection). With the click of a few buttons, you are sharing your live video and audio with the person you are communicating with. It's that simple. When you are seeing and hearing your grand kids live across the country or halfway around the world, web camming is the next best thing to being there!", "link": "http://buffalo.craigslist.org/cps/4784390462.html", "title": "Web Cam With Your Computer With Family And Friends"},
{"content": "Looking to supplement or increase your earnings?", "link": "http://buffalo.craigslist.org/cps/4782757517.html", "title": "1k in 30 Day's"},
{"content": "Like us on Facebook: https://www.facebook.com/pages/NFB-Systems/514380315268768", "link": "http://buffalo.craigslist.org/cps/4813039886.html", "title": "NFB SYSTEMS COMPUTER SERVICES + WEB DESIGNING"},
{"content": "Like us on Facebook: https://www.facebook.com/pages/NFB-Systems/514380315268768", "link": "http://buffalo.craigslist.org/cps/4810219714.html", "title": "NFB Systems Computer Repair + Web Designing"},
{"content": "I can work with you personally and we design your site together (no outsourcing or anything like that!) I'll even train you how to use your brand new site. (Wordpress is really easy to use once it is setup!)", "link": "http://buffalo.craigslist.org/cps/4792628034.html", "title": "I Make First-Class Wordpress Sites with Training"},
...
]