I am trying to use Selenium to find specific items on a webpage, but I want to make the script general enough to work on similar pages. My current code is below.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import requests
driver = webdriver.Chrome(executable_path='/Applications/chromedriver_mac_arm64 (1)/chromedriver')
driver.get(
"https://www.usgbc.org/projects/?Country=%5B%22United+States%22%5D&Rating+System=%5B%22New+Construction%22%5D&Rating+Version=%5B%22v2009%22%5D&Certification=%5B%22Platinum%22%5D&State=%5B%22Texas%22%5D")
# check this is the right website
# print(driver.title)
# list of building names to quickly check what has been added
buildings = []
locations = []
# lists for items on project page
sqft_amount = []
# dataframe to collect all building information
df_main = pd.DataFrame()
# dataframe to collect building profile data on page
df_profile_data = pd.DataFrame()
# dataframe to collect scorecard data
df_scorecard = pd.DataFrame()
# make this happen for next button
while True:
    try:
        project_profiles = driver.find_elements(By.CLASS_NAME, "grid-item--title")
        for i in range(len(project_profiles)):
            # Wait for the page to load, as the order of elements will be incorrect otherwise
            time.sleep(1)
            project_profiles = driver.find_elements(By.CLASS_NAME, "grid-item--title")  # Find the list again
            # append and add to df
            building_name = project_profiles[i].text
            buildings.append(building_name)
            print(building_name)
            # enable if checking all
            # building profile page information grab##############################
            # load building profile page
            building_profile_link = driver.find_element(By.XPATH, f"//div[@id='result-grid']//h1[text()='{building_name}']")
            building_profile_link.click()
            time.sleep(1)
            # address
            address = driver.find_elements(By.CLASS_NAME, 'projectAddress')
            for i in address:
                building_address = i.text
                locations.append(building_address)
            print(locations)
            # get values from tables on page
            row_data = []
            col_data = []
            # the row typically starts with td[]##
            # copy the XPath of the first row, then make it end in /td (copy XPath, not full path)
            rows = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/tbody/tr/td")))
            # the col typically starts with th[]##
            # copy the XPath of the first row, then make it end in /th (copy XPath, not full path)
            columns = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/thead/tr/th")))
            for row in rows:
                row_data.append(row.text)
            for cols in columns:
                col_data.append(cols.text)
            # time.sleep(5)
            print(row_data, "row")
            print(col_data, "col")
            # DataFrame.append does not modify in place, so keep the returned frame
            df_profile_data = pd.concat([df_profile_data, pd.DataFrame([row_data])], ignore_index=True)
            ################################SQFT##########################################
            # import requests
            get_url = driver.current_url
            print("The current url is: " + str(get_url))
            URL = get_url
            html = requests.get(URL).content
            df_list = pd.read_html(html)
            df = df_list[-1]
            SQFT = df.iloc[0, 1]
            SQFT = SQFT.replace('sq ft', '')
            sqft_amount.append(SQFT)
            # print(SQFT)
            ######### load credit scorecard page##########################################
            # building_scorecard_link = driver.find_element(By.XPATH, f"//div[@id='project-details--wrapper']//h1[text()='Scorecard']")
            building_scorecard_link = driver.find_element(By.PARTIAL_LINK_TEXT, 'Scorecard')
            # /html/body/div[1]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/table/tbody/tr/td[2]
            building_scorecard_link.click()
            time.sleep(2)
            # grab data on category point totals
            point_data = []
            point_total = driver.find_elements(By.CLASS_NAME, 'category-score')
            for points in point_total:
                point_data.append(points.text)
            print('cat scores', point_data)
            # category names
            cat_names = []
            # expand credit areas
            sus_link = driver.find_elements(By.CLASS_NAME, 'category-title')
            # sus_link = WebDriverWait(driver, 15).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'category-title')))
            for i in sus_link:
                i.click()
                print('cat_names:', i.text)
                cat_names.append(i.text)
            # if i == 'SUSTAINABLE SITES':
            # grab specific credit name
            # rows = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/tbody/tr/td")))
            # so far the CSS selector works but also times out
            # tag_name 'span' also works, but it pulls everything, so data cleaning is needed
            # class name is not working
            credit_names = []
            content = driver.find_elements(By.CSS_SELECTOR, 'span.credit-name')
            # content = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'cred-name')))
            # content = driver.find_elements(By.CLASS_NAME, 'credit-name')
            for i in content:
                # print(i.text)
                credit_names.append(i.text)
            print('cred_name', credit_names)
            # grab data on category points
            sus_data = []
            content = driver.find_elements(By.CLASS_NAME, 'num')
            for points in content:
                sus_data.append(points.text)
            print('sus_scores', sus_data)
            # add all these things into df
            df_scorecard = pd.DataFrame()
            # exit scorecard page
            driver.back()
            # exit building profile page
            driver.back()
        # move onto next page#############################################
        # Perform your desired actions on each page here
        # Add some wait here for next page load
        # Check if the next button is disabled
        next_button = WebDriverWait(driver, 5).until(EC.presence_of_element_located(
            (By.XPATH, '//div[@id="result-grid"]//a[text()="Next"]')))
        if next_button.get_attribute('disabled'):
            time.sleep(1)
            break  # Exit the loop if the next button is disabled
        else:
            # Click the next button to navigate to the next page
            time.sleep(1)
            next_button.click()
    except IndexError:
        break
# quit the chrome driver option
driver.quit()
Basically, when the code reaches the # grab specific credit name block and the # grab data on category points block, I see the following: if I use By.CSS_SELECTOR without a WebDriverWait, it finds most of the items but not all of them, and after a certain number of buildings have been processed it may still time out. When I try to use WebDriverWait, it usually times out. If I use By.CLASS_NAME, Selenium cannot find any elements, even though the class name is correct. If I use By.TAG_NAME, it finds every 'span' element on the page, which is not ideal when I am only after the 'credit-name' or 'num' elements.
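To make that concrete, these are the variations I have been swapping in for the credit-name lookup on the scorecard page (a condensed sketch of the attempts, using the same selectors and timeout as in the code above):

# 1) CSS selector without a wait: finds most of the credit names, but not all of them
content = driver.find_elements(By.CSS_SELECTOR, 'span.credit-name')
# 2) Explicit wait on the same selector: usually times out
content = WebDriverWait(driver, 30).until(
    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'span.credit-name')))
# 3) Class name: returns an empty list even though the class looks correct
content = driver.find_elements(By.CLASS_NAME, 'credit-name')
# 4) Tag name: returns every span on the page, so heavy filtering would be needed
content = driver.find_elements(By.TAG_NAME, 'span')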
Overall, I need help understanding why the methods above are not working, plus any other suggestions anyone may have. I am decent at coding in Python but am very new to Selenium.
Thanks!