I am trying to scrape data from people.yellowpages.com i only need is Email,Phone,Address. I have been working on this code lately and it worked for business related organizations. but when it comes to searching for person data it doesn't work. Anyone could help me out what i am doing wrong here.
Note: i need to scrape person data from people.yellowpages.com. When i tried to run the program it goes through for loop and then error.
import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
"""
Function to process
: param keyword: search query
: param place : place name
"""
url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
print("retrieving ",url)
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Host':'www.yellowpages.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
# Adding retries
print("helllo")
for retry in range(10):
try:
response = requests.get(url,verify=False, headers = headers )
print("parsing page")
print(response)
sleep(10)
if response.status_code==200:
parser = html.fromstring(response.text)
#making links absolute
base_url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
parser.make_links_absolute(base_url)
print(base_url)
XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']"
listings = parser.xpath(XPATH_LISTINGS)
scraped_results = []
print("wait")
for results in listings:
XPATH_fullname = ".//a[@class='fullname']//text()"
XPATH_phone = ".//div[@itemprop='phone']//text()"
XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']"
#XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"
raw_fullname = results.xpath(XPATH_fullname)
raw_phone = results.xpath(XPATH_phone)
#raw_AGE = results.xpath(XPATH_AGE)
raw_address = results.xpath(XPATH_address)
print("worked")
fullname = ''.join(raw_fullname).strip() if raw_fullname else None
phone = ''.join(raw_phone).strip() if raw_phone else None
address = ''.join(raw_address).strip() if raw_address else None
#age = ''.join(raw_AGE).strip() if raw_zip_code else None
business_details = {
'name':fullname,
'telephone':phone,
'address':address,
#'age':AGE,
'listing_url':response.url
}
scraped_results.append(business_details)
return scraped_results
print(business_details)
elif response.status_code==404:
print("Could not find a location matching",keyword)
#no need to retry for non existing page
break
else:
print("Failed to process page")
return []
except:
print("Failed to process page")
return []
if __name__=="__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('keyword',help = 'keyword')
#argparser.add_argument('place',help = 'Place Name')
args = argparser.parse_args()
keyword = args.keyword
#place = args.place
scraped_data = parse_listing(keyword,)
if scraped_data:
print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
fieldnames = ['NAME','telephone','ADDRESS','listing_url']
writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
writer.writeheader()
for data in scraped_data:
writer.writerow(data)
NEVER do like that:
you always MUST specify certain exceptions. Lets try manually run requests.get:
Look at error: requests.exceptions.TooManyRedirects: Exceeded 30 redirects Lets try get without allow_redirect:
Do you see that web server always redirect you to same url? May be problem in
in headers?