How to scrape information from multiple pages of results in Google Scholar


The code I have written is meant to return information from multiple pages of Google Scholar results, using Selenium together with BeautifulSoup. I can successfully scrape the data for the first page of results, but those are the only results I can return, even though I change the URL on each iteration to point to the next page. I can also confirm that the URL is loaded into the Selenium driver by checking the driver.current_url attribute against the URL I pass into the for loop.

I modify the URL so that the "start=" parameter is equal to the offset of the next page, and when I print these URLs to the terminal and open them manually, they bring up the correct page.

I have not made this workflow headless because I wanted to solve the CAPTCHAs as they cropped up, so that should not be an issue either.

I am using the same query throughout; I just want to iterate through the different pages of results returned for that Google Scholar search.

I would really appreciate any help on this.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import time
import random
import os
import re

def get_tags(doc):
    paper_tag = doc.select('[data-lid]')
    cite_tag = doc.select('a:contains("Cited by")') 
    link_tag = doc.find_all('h3',{"class" : "gs_rt"})
    author_tag = doc.find_all("div", {"class": "gs_a"})
    return paper_tag, cite_tag, link_tag, author_tag 

def get_papertitle(paper_tag):
    paper_names = []
    for tag in paper_tag:
        paper_names.append(tag.select('h3')[0].get_text())    
    return paper_names

def get_link(link_tag):
    links = []
    
    for i in range(len(link_tag)) :
        links.append(link_tag[i].a['href']) 
        
    return links 

def get_author_year_publi_info(authors_tag):
    years = []
    publication = []
    authors = []
    for i in range(len(authors_tag)):
        authortag_text = (authors_tag[i].text).split()
        year = int(re.search(r'\d+', authors_tag[i].text).group())
        years.append(year)
        publication.append(authortag_text[-1])
        author = authortag_text[0] + ' ' + re.sub(',','', authortag_text[1])
        authors.append(author)
    return years , publication, authors

def get_citecount(cite_tag):
    cite_count = []
    for i in cite_tag:
        if i is None or cite_tag is None:  # if the paper has no citations, count it as 0
            cite_count.append(0)
        else:
            tmp = re.search(r'\d+', i.text) 
            """
            its handle the None type object error and re use to remove the string 
            " cited by " and return only integer value
            """
            if tmp is None:
                cite_count.append(0)
            else:
                cite_count.append(int(tmp.group()))
    return cite_count

def get_paperinfo(url, query):
    #download the page
    driver = webdriver.Chrome()
    
    driver.get(url)
    
    time.sleep(30)
    print("URL used:\t", url, "\n")
    print("\nLoaded URL:\t", driver.current_url, "\n")
    paper_doc = BeautifulSoup(driver.page_source,'html.parser')
    driver.quit()
    return paper_doc

paper_repos_dict = {
    "Paper Title": [],
    "Year": [],
    "Author": [],
    "Citation": [],
    "Publication": [],
    "Url of Paper": [],
}

def add_in_paper_repo(papername, year, author, cite, publi, link):
    paper_repos_dict['Paper Title'].extend(papername)
    paper_repos_dict['Year'].extend(year)
    paper_repos_dict['Author'].extend(author)
    paper_repos_dict['Citation'].extend(cite)
    paper_repos_dict['Publication'].extend(publi)
    paper_repos_dict['Url of Paper'].extend(link)
    emptykey = [k for (k,v) in paper_repos_dict.items() if len(paper_repos_dict[k]) == 0]
    print(emptykey)
    return pd.DataFrame(paper_repos_dict)


# Set the URL for the Google Scholar search
search_url = "https://scholar.google.com/"

# Set the query for the Google Scholar search
search_query = "seagrass habitat mapping unmanned aerial vehicles"

df = pd.DataFrame()

for i in range (0, 110, 10):
    
    initial_url = os.path.join(search_url[:-1], "scholar?").replace("\\", "/")
    page_beg = "start={}".format(i)
    remaining_url = "&q=" + search_query.replace(" ", "+") +"&hl=en&as_sdt=0,5"
    
    real_url = initial_url + page_beg + remaining_url
    
    # function for the get content of each page
    doc = get_paperinfo(real_url, search_query)
    
    # function for the collecting tags
    paper_tag, cite_tag, link_tag, author_tag = get_tags(doc) 
    
    # paper title from each page
    papername = get_papertitle(paper_tag)
    
    year, publication, author = get_author_year_publi_info(author_tag)
    
    # cite count of the paper 
    cite = get_citecount(cite_tag)
    
    # url of the paper
    link = get_link(link_tag)
    
    # add in paper repo dict
    subdf = add_in_paper_repo(papername, year, author, cite, publication, link)
    
    print("\n:", subdf.head(10))
    
    df = df.append(subdf)
    
    time.sleep(random.randint(20,40))

# Save the DataFrame to a CSV file

print("\n:", df.shape)

print("\n:", df.head(20))

df.to_csv(
    "D:/Documents/Post_Doc/Writing_and_Learning/Lit_Review/google_scholar_results_new.csv",
    index = False
)
print("\nI'm Finished!!!")

I want the data from multiple pages of Google Scholar results; however, I am only getting the first page each time. I would like a solution that does not resort to SerpApi.

This is a subset of the first 20 returned results. Please note that the results for the second page are a duplicate of the first one.
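A quick check on the combined DataFrame makes the duplication visible; this is illustrative only and not part of the script above:

# illustrative: count rows that exactly duplicate an earlier row, and unique titles
print(df.duplicated().sum(), "duplicated rows out of", len(df))
print(df["Paper Title"].nunique(), "unique paper titles across all pages")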

There are 2 best solutions below

As per this documentation of the get-page-source command, driver.page_source returns the source of the last loaded page, which is an important distinction to make. I am not certain about Google Scholar specifically, but my assumption is that its pages are loaded via script: you load the first page, and the subsequent pages are produced by script execution.

Try printing out the result of your GET request and see what you receive; that will show you what is actually happening with the response the driver gets.

In any case, if it is as I suspect, this is what you want to do after loading the page:

pageSource = driver.execute_script("return document.documentElement.outerHTML")

As seen in this Stack Overflow answer, this returns the HTML content of the page as it is currently rendered, after any scripts have run.
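For instance, the get_paperinfo function from the question could be adapted along these lines. This is only a sketch: it assumes the [data-lid] elements the question already selects are a reasonable thing to wait for, and it keeps a fresh driver per call as in the original.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

def get_paperinfo(url, query):
    driver = webdriver.Chrome()
    driver.get(url)

    # wait until at least one result entry is present instead of sleeping for a fixed time;
    # extend the timeout if a CAPTCHA has to be solved by hand
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-lid]"))
    )

    print("URL used:\t", url, "\n")
    print("\nLoaded URL:\t", driver.current_url, "\n")

    # grab the DOM as currently rendered rather than the driver's cached page source
    page_source = driver.execute_script("return document.documentElement.outerHTML")
    paper_doc = BeautifulSoup(page_source, 'html.parser')
    driver.quit()
    return paper_doc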

os.path.join is not designed for URLs. Use urllib.parse.urljoin for joining URLs and urllib.parse.urlencode for encoding query parameters (it replaces whitespace with +, escapes special characters, and so on):

from urllib.parse import urljoin, urlencode

search_url = "https://scholar.google.com/"
search_query = "seagrass habitat mapping unmanned aerial vehicles"

params = { "q": search_query, "hl": "en", "as_sdt": "0,5", "start": "10" }

urljoin(search_url, "scholar") + "?" + urlencode(params)

Output

'https://scholar.google.com/scholar?q=seagrass+habitat+mapping+unmanned+aerial+vehicles&hl=en&as_sdt=0%2C5&start=10'
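Applied to the paging loop in the question, the construction might look like this. This is a sketch that keeps the question's start values (0 through 100) and query, and hands each URL to the existing get_paperinfo:

from urllib.parse import urljoin, urlencode

search_url = "https://scholar.google.com/"
search_query = "seagrass habitat mapping unmanned aerial vehicles"

for start in range(0, 110, 10):
    # urlencode handles the spaces in the query and the comma in as_sdt
    params = {"q": search_query, "hl": "en", "as_sdt": "0,5", "start": start}
    real_url = urljoin(search_url, "scholar") + "?" + urlencode(params)
    print(real_url)
    # doc = get_paperinfo(real_url, search_query)  # then proceed exactly as in the question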

Here's the SerpApi blog post about scraping Google Scholar results.

import json
from serpapi import SerpApiClient


search_query = "web scraping and LLMs"

params = {
    "api_key": "...",            # Get your SerpApi API key at https://serpapi.com/manage-api-key
    "engine": "google_scholar",  # search engine
    "q": search_query,
    "hl": "en",                  # language
    # "as_ylo": "2017",          # from 2017
    # "as_yhi": "2021",          # to 2021
}

search = SerpApiClient(params)

publications = []

for page in search.pagination():
    page_number = page.get('serpapi_pagination', {}).get('current')
    print(f"Currently extracting page #{page_number}..")

    for result in page.get("organic_results", []):
        position = result["position"]
        title = result["title"]
        publication_info_summary = result["publication_info"]["summary"]
        result_id = result["result_id"]
        link = result.get("link")
        result_type = result.get("type")
        snippet = result.get("snippet")

        publications.append({
            "page_number": page_number,
            "position": position + 1,
            "result_type": result_type,
            "title": title,
            "link": link,
            "result_id": result_id,
            "publication_info_summary": publication_info_summary,
            "snippet": snippet,
        })

print(json.dumps(publications, indent=2, ensure_ascii=False))

Output

// ...
{
  "page_number": 93,
  "position": 10,
  "result_type": null,
  "title": "Reproductive biology of Bacopa monnieri.",
  "link": "https://www.cabdirect.org/cabdirect/abstract/20013163708",
  "result_id": "Pqvbi-PQGBUJ",
  "publication_info_summary": "S Mathur, S Kumar - Journal of Genetics & Breeding, 2001 - cabdirect.org",
  "snippet": "B. monnieri (2n= 64), family Scrophulariaceae, a prostrate herb common to river banks and other water bodies has a widespread distribution in the world. The plant is an important …"
},
{
  "page_number": 94,
  "position": 1,
  "result_type": null,
  "title": "Studies on biology of banana pseudostem weevil, Odoiporus longicollis Olivier.",
  "link": "https://www.cabdirect.org/cabdirect/abstract/20133014553",
  "result_id": "3cghTZqAhKIJ",
  "publication_info_summary": "M Thippaiah, CTA Kumar, C Shivaraju… - Journal of Insect …, 2012 - cabdirect.org",
  "snippet": "Studies on biology of Odoiporus longicollis Oliver revealed that a single female laid 15 to 21 eggs in one season. Mean incubation period varied between 3-5 days during June to …"
},
// ...

Disclaimer: I work at SerpApi.com.