Scraping yellowpages
I am trying to scrap data from people.yellowpages.com i only need is Email,Phone,Address. I have been working on this code lately and it worked for business related organizations. but when it comes to searching for person data it doesn't work. Anyone could help me out what i am doing wrong here.
Note: i need to scrap person data from people.yellowpages.com. When i tried to run the program it goes through for loop and then error.
import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
"""
Function to process
: param keyword: search query
: param place : place name
"""
url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
print("retrieving ",url)
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Host':'www.yellowpages.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
# Adding retries
print("helllo")
for retry in range(10):
try:
response = requests.get(url,verify=False, headers = headers )
print("parsing page")
print(response)
sleep(10)
if response.status_code==200:
parser = html.fromstring(response.text)
#making links absolute
base_url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
parser.make_links_absolute(base_url)
print(base_url)
XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']"
listings = parser.xpath(XPATH_LISTINGS)
scraped_results = []
print("wait")
for results in listings:
XPATH_fullname = ".//a[@class='fullname']//text()"
XPATH_phone = ".//div[@itemprop='phone']//text()"
XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']"
#XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"
raw_fullname = results.xpath(XPATH_fullname)
raw_phone = results.xpath(XPATH_phone)
#raw_AGE = results.xpath(XPATH_AGE)
raw_address = results.xpath(XPATH_address)
print("worked")
fullname = ''.join(raw_fullname).strip() if raw_fullname else None
phone = ''.join(raw_phone).strip() if raw_phone else None
address = ''.join(raw_address).strip() if raw_address else None
#age = ''.join(raw_AGE).strip() if raw_zip_code else None
business_details = {
'name':fullname,
'telephone':phone,
'address':address,
#'age':AGE,
'listing_url':response.url
}
scraped_results.append(business_details)
return scraped_results
print(business_details)
elif response.status_code==404:
print("Could not find a location matching",keyword)
#no need to retry for non existing page
break
else:
print("Failed to process page")
return []
except:
print("Failed to process page")
return []
if __name__=="__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('keyword',help = 'keyword')
#argparser.add_argument('place',help = 'Place Name')
args = argparser.parse_args()
keyword = args.keyword
#place = args.place
scraped_data = parse_listing(keyword,)
if scraped_data:
print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
fieldnames = ['NAME','telephone','ADDRESS','listing_url']
writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
writer.writeheader()
for data in scraped_data:
writer.writerow(data)
1 ответ
НИКОГДА не делайте этого:
except:
вы всегда ДОЛЖНЫ указывать определенные исключения. Давайте попробуем вручную запустить requests.get:
(Pdb) requests.get(url,verify=False, headers = headers )
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
....
*** requests.exceptions.TooManyRedirects: Exceeded 30 redirects.
Посмотрите на ошибку: requests.exceptions.TooManyRedirects: Превышено 30 перенаправлений. Давайте попробуем получить без allow_redirect:
(Pdb) response = requests.get(url,verify=False, headers = headers, allow_redirects=False)
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
(Pdb) response
<Response [301]>
(Pdb) response.headers
{'Date': 'Mon, 18 Nov 2019 09:09:35 GMT', 'Content-Type': 'text/html', 'Content-Length': '178', 'Connection': 'keep-alive', 'Location': 'https://people.yellowpages.com/whitepages/?last_name=john', 'Set-Cookie': 'TS0145ce01=01d0bb65df96e04f8ea20dfc3b81c2fbe967f216df827b11fbedaa89ee06a10f05ae6a0759; Path=/'}
(Pdb) url
'https://people.yellowpages.com/whitepages/?last_name=john'
(Pdb) response.headers["Location"]
'https://people.yellowpages.com/whitepages/?last_name=john'
Вы видите, что веб-сервер всегда перенаправляет вас на один и тот же URL-адрес? Может быть проблема в
'Host':'www.yellowpages.com',
в заголовках?