Scraping yellowpages

I am trying to scrape data from people.yellowpages.com; all I need is the email, phone, and address. I have been working on this code lately and it worked for business listings, but when it comes to searching for person data it doesn't work. Could anyone help me figure out what I am doing wrong here?

Note: I need to scrape person data from people.yellowpages.com. When I run the program, it goes through the for loop and then errors out.

import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
    """
    Scrape people.yellowpages.com search results for a keyword.

    :param keyword: last name to search for
    """
    url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
    print("retrieving ",url)

    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
                'Cache-Control':'max-age=0',
                'Connection':'keep-alive',
                'Host':'www.yellowpages.com',
                'Upgrade-Insecure-Requests':'1',
                'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
            }
    # Adding retries
    for retry in range(10):
        try:
            response = requests.get(url,verify=False, headers = headers )
            print("parsing page")
            print(response)
            time.sleep(10)
            if response.status_code==200:
                parser = html.fromstring(response.text)
                # making links absolute (the base URL is the URL we requested)
                parser.make_links_absolute(url)
                XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']" 
                listings = parser.xpath(XPATH_LISTINGS)
                scraped_results = []
                print("wait")

                for results in listings:
                    XPATH_fullname = ".//a[@class='fullname']//text()" 
                    XPATH_phone = ".//div[@itemprop='phone']//text()"
                    XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']//text()"
                    #XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"

                    raw_fullname = results.xpath(XPATH_fullname)
                    raw_phone = results.xpath(XPATH_phone)  
                    #raw_AGE = results.xpath(XPATH_AGE)
                    raw_address = results.xpath(XPATH_address)
                    print("worked")
                    fullname = ''.join(raw_fullname).strip() if raw_fullname else None
                    phone = ''.join(raw_phone).strip() if raw_phone else None
                    address = ''.join(raw_address).strip() if raw_address else None
                    #age = ''.join(raw_AGE).strip() if raw_zip_code else None


                    business_details = {
                                        'name':fullname,
                                        'telephone':phone,
                                        'address':address,
                                        #'age':AGE,
                                        'listing_url':response.url
                    }
                    scraped_results.append(business_details)

                print(scraped_results)
                return scraped_results

            elif response.status_code==404:
                print("Could not find a location matching",keyword)
                #no need to retry for non existing page
                break
            else:
                print("Failed to process page")
                return []
                
        except:
            print("Failed to process page")
            return []


if __name__=="__main__":
    
    argparser = argparse.ArgumentParser()
    argparser.add_argument('keyword',help = 'keyword')
    #argparser.add_argument('place',help = 'Place Name')
    
    args = argparser.parse_args()
    keyword = args.keyword
    #place = args.place
    scraped_data = parse_listing(keyword)
    
    if scraped_data:
        print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
        with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
            fieldnames = ['NAME','telephone','ADDRESS','listing_url']
            writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
            writer.writeheader()
            for data in scraped_data:
                writer.writerow(data)

1 Answer

NEVER do this:

except:

You must ALWAYS catch specific exceptions; a bare except swallows the real error, which is exactly why you can't see what is failing here.
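
For example, here is a minimal sketch of what the error handling in your retry loop could look like (reusing the url and headers variables from the question; the timeout value is only an illustration):

import requests

try:
    response = requests.get(url, verify=False, headers=headers, timeout=30)
    response.raise_for_status()
except requests.exceptions.TooManyRedirects:
    # redirect loop: the server keeps sending us back to the same URL
    print("Redirect loop for", url)
except requests.exceptions.HTTPError as e:
    print("HTTP error:", e)
except requests.exceptions.RequestException as e:
    # base class of all other requests errors (timeouts, connection errors, ...)
    print("Request failed:", e)

With specific exceptions, the real error surfaces immediately. Let's try manually running requests.get: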

(Pdb) requests.get(url,verify=False, headers = headers )
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
....
*** requests.exceptions.TooManyRedirects: Exceeded 30 redirects.

Look at the error: requests.exceptions.TooManyRedirects: Exceeded 30 redirects. Let's try the same GET with allow_redirects=False:

(Pdb) response = requests.get(url,verify=False, headers = headers,  allow_redirects=False)
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
(Pdb) response
<Response [301]>
(Pdb) response.headers
{'Date': 'Mon, 18 Nov 2019 09:09:35 GMT', 'Content-Type': 'text/html', 'Content-Length': '178', 'Connection': 'keep-alive', 'Location': 'https://people.yellowpages.com/whitepages/?last_name=john', 'Set-Cookie': 'TS0145ce01=01d0bb65df96e04f8ea20dfc3b81c2fbe967f216df827b11fbedaa89ee06a10f05ae6a0759; Path=/'}
(Pdb) url
'https://people.yellowpages.com/whitepages/?last_name=john'
(Pdb) response.headers["Location"]
'https://people.yellowpages.com/whitepages/?last_name=john'

Do you see that the web server always redirects you to the same URL you requested? Maybe the problem is the

'Host':'www.yellowpages.com',

entry in your headers?
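
If so, that Host header names a different hostname than the one in the URL, which can easily make the server keep redirecting you. A minimal sketch of the likely fix (an assumption, not verified against the live site): drop the Host entry and let requests derive it from the URL.

import requests

url = 'https://people.yellowpages.com/whitepages/?last_name=john'

# Same headers as before, minus the wrong Host entry; requests will
# set Host to people.yellowpages.com automatically from the URL.
headers = {
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'),
}

response = requests.get(url, headers=headers, allow_redirects=False)
# If the Host header was the culprit, this should now be a 200
# instead of a 301 back to the same URL.
print(response.status_code)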