Scraping yellowpages

I am trying to scrap data from people.yellowpages.com i only need is Email,Phone,Address. I have been working on this code lately and it worked for business related organizations. but when it comes to searching for person data it doesn't work. Anyone could help me out what i am doing wrong here.

Note: i need to scrap person data from people.yellowpages.com. When i tried to run the program it goes through for loop and then error.

import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
    """

    Function to process 
    : param keyword: search query
    : param place : place name

    """
    url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
    print("retrieving ",url)

    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
                'Cache-Control':'max-age=0',
                'Connection':'keep-alive',
                'Host':'www.yellowpages.com',
                'Upgrade-Insecure-Requests':'1',
                'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
            }
    # Adding retries
    print("helllo")
    for retry in range(10):
        try:
            response = requests.get(url,verify=False, headers = headers )
            print("parsing page")
            print(response)
            sleep(10)
            if response.status_code==200:
                parser = html.fromstring(response.text)
                #making links absolute
                base_url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
                parser.make_links_absolute(base_url)
                print(base_url)
                XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']" 
                listings = parser.xpath(XPATH_LISTINGS)
                scraped_results = []
                print("wait")

                for results in listings:
                    XPATH_fullname = ".//a[@class='fullname']//text()" 
                    XPATH_phone = ".//div[@itemprop='phone']//text()"
                    XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']"
                    #XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"

                    raw_fullname = results.xpath(XPATH_fullname)
                    raw_phone = results.xpath(XPATH_phone)  
                    #raw_AGE = results.xpath(XPATH_AGE)
                    raw_address = results.xpath(XPATH_address)
                    print("worked")
                    fullname = ''.join(raw_fullname).strip() if raw_fullname else None
                    phone = ''.join(raw_phone).strip() if raw_phone else None
                    address = ''.join(raw_address).strip() if raw_address else None
                    #age = ''.join(raw_AGE).strip() if raw_zip_code else None


                    business_details = {
                                        'name':fullname,
                                        'telephone':phone,
                                        'address':address,
                                        #'age':AGE,
                                        'listing_url':response.url
                    }
                    scraped_results.append(business_details)

                return scraped_results
                print(business_details)

            elif response.status_code==404:
                print("Could not find a location matching",keyword)
                #no need to retry for non existing page
                break
            else:
                print("Failed to process page")
                return []

        except:
            print("Failed to process page")
            return []


if __name__=="__main__":

    argparser = argparse.ArgumentParser()
    argparser.add_argument('keyword',help = 'keyword')
    #argparser.add_argument('place',help = 'Place Name')

    args = argparser.parse_args()
    keyword = args.keyword
    #place = args.place
    scraped_data =  parse_listing(keyword,) 

    if scraped_data:
        print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
        with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
            fieldnames = ['NAME','telephone','ADDRESS','listing_url']
            writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
            writer.writeheader()
            for data in scraped_data:
                writer.writerow(data)

1 ответ

НИКОГДА не делайте этого:

except:

вы всегда ДОЛЖНЫ указывать определенные исключения. Давайте попробуем вручную запустить requests.get:

(Pdb) requests.get(url,verify=False, headers = headers )
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
....
*** requests.exceptions.TooManyRedirects: Exceeded 30 redirects.

Посмотрите на ошибку: requests.exceptions.TooManyRedirects: Превышено 30 перенаправлений. Давайте попробуем получить без allow_redirect:

(Pdb) response = requests.get(url,verify=False, headers = headers,  allow_redirects=False)
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
(Pdb) response
<Response [301]>
(Pdb) response.headers
{'Date': 'Mon, 18 Nov 2019 09:09:35 GMT', 'Content-Type': 'text/html', 'Content-Length': '178', 'Connection': 'keep-alive', 'Location': 'https://people.yellowpages.com/whitepages/?last_name=john', 'Set-Cookie': 'TS0145ce01=01d0bb65df96e04f8ea20dfc3b81c2fbe967f216df827b11fbedaa89ee06a10f05ae6a0759; Path=/'}
(Pdb) url
'https://people.yellowpages.com/whitepages/?last_name=john'
(Pdb) response.headers["Location"]
'https://people.yellowpages.com/whitepages/?last_name=john'

Вы видите, что веб-сервер всегда перенаправляет вас на один и тот же URL-адрес? Может быть проблема в

'Host':'www.yellowpages.com',

в заголовках?

Другие вопросы по тегам