Solving hCaptcha using 2captcha to automate a website search (Python)

462 Views Asked by At

I'm trying to automate a web search via Python.

The website is behind hCaptcha but I'm using a 2captcha solver.

Although I've replicated the web browser's behavior, I'm still being asked to solve the hCaptcha again.

Here's what I've tried:

import httpx
import trio
from twocaptcha import TwoCaptcha


# Scheme and host of the target site; reused by the Referer and Origin headers.
_SITE_ORIGIN = 'https://iapps.courts.state.ny.us'

# Default headers handed to the HTTP client so requests carry browser-like
# User-Agent, Referer and Origin values.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) '
        'Gecko/20100101 Firefox/122.0'
    ),
    'Referer': _SITE_ORIGIN + '/nyscef/CaseSearch?TAB=courtDateRange',
    'Origin': _SITE_ORIGIN,
}


# API key passed to the TwoCaptcha solver.
API_KEY = 'hidden'


async def solve_captcha(
    sitekey='600d5d8e-5e97-4059-9fd8-373c17f73d11',
    page_url='https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange',
):
    """Ask 2captcha to solve the site's hCaptcha and return the token.

    Args:
        sitekey: hCaptcha site key of the protected page.
        page_url: Full URL of the page on which the captcha is shown.
            Defaults to the case-search page rather than the bare site
            root, because the solver is expected to be given the page
            that actually hosts the widget.

    Returns:
        The ``'code'`` entry of the solver's result (the response token).

    Raises:
        Whatever ``TwoCaptcha.hcaptcha`` raises; nothing is caught here.
    """
    solver = TwoCaptcha(API_KEY)

    def _request_solution():
        return solver.hcaptcha(sitekey=sitekey, url=page_url)

    # The solver call is synchronous; running it in a worker thread keeps
    # the trio event loop free while the solution is pending.
    result = await trio.to_thread.run_sync(_request_solution)
    return result['code']


async def main():
    """Submit the court/date search with a solved captcha and save the page.

    Sends an initial request to the search page, obtains a captcha token
    from ``solve_captcha()``, posts the search form with that token and
    writes the response body to ``r.html``.
    """
    # Both requests target the same page, resolved against ``base_url``.
    search_path = 'CaseSearch?TAB=courtDateRange'
    async with httpx.AsyncClient(
        base_url='https://iapps.courts.state.ny.us/nyscef/',
        headers=headers,
        follow_redirects=True,
    ) as client:
        # The response is not used; presumably this request is only made to
        # establish the session before the form is submitted.
        await client.post(search_path)
        print('[*] - Solving CAPTCHA!')
        cap = await solve_captcha()
        print('[*] - CAPTCHA Solved')
        # Court: Chautauqua County Supreme Court
        data = {
            'selCountyCourt': '4667226',
            'txtFilingDate': '02/14/2024',
            'g-recaptcha-response': cap,
            'h-captcha-response': cap,
            'btnSubmit': 'Search',
        }
        r = await client.post(search_path, data=data)
        # An explicit encoding avoids UnicodeEncodeError on platforms whose
        # default text encoding cannot represent every character in the page.
        with open('r.html', 'w', encoding='utf-8') as f:
            f.write(r.text)

# Script entry point: run the async main() under trio.
if __name__ == "__main__":
    trio.run(main)
1

There is 1 best solution below

0
thetaco On

I adjusted your code to repeatedly solve captchas whenever they appeared. After going through 10 captchas in a row, I assumed the website knew I was scraping and would keep serving captchas indefinitely; for that reason I have created a different solution that works and also saves money on 2captcha fees.

This solution uses Selenium and requires undetected_chromedriver. The driver is open source and located here. It can be installed with the following:

pip install undetected-chromedriver

Using this chromedriver lets you go undetected by almost all modern detection methods. It also saves time, since no captchas have to be solved, and saves money, since you are not paying for 2captcha's services. Here is the code that scrapes your desired page:

import undetected_chromedriver as uc
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random

def _human_pause(low=1, high=5):
    """Sleep for a random duration between *low* and *high* seconds."""
    time.sleep(random.uniform(low, high))


def main():
    """Run the court/date search in a real browser and save the result page.

    Opens the case-search page with undetected-chromedriver, selects the
    court, types the filing date, submits with ENTER and writes the
    resulting page source to ``page.html``. The browser is always closed,
    even when a step fails.
    """
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")

    driver = uc.Chrome(options=options)
    try:
        url = 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange'
        driver.get(url)
        _human_pause()

        # Wait (up to 20 s) for the court dropdown before touching the form.
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, 'selCountyCourt')))
        _human_pause()  # human-like wait time

        dropdown = Select(driver.find_element(By.ID, 'selCountyCourt'))
        dropdown.select_by_value('4667226')  # value for 'Chautauqua County Supreme Court'
        _human_pause()
        date_input = driver.find_element(By.ID, 'txtFilingDate')
        date_input.send_keys('02/14/2024')  # value of the desired date
        _human_pause()
        date_input.send_keys(Keys.ENTER)
        # Fixed delay before reading the page source; presumably long enough
        # for the results page to load.
        time.sleep(3)
        # save the html
        with open('page.html', 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
    finally:
        # Always shut the browser down so a failed step does not leave a
        # Chrome/chromedriver process behind.
        driver.quit()

# Script entry point: run the Selenium scrape when executed directly.
if __name__ == "__main__":
    main()

This saves the full html of your desired page to "page.html".

NOTE: if you initially get an error about the Chromedriver version not being supported, close your browser and run the module with no browser open.

If you wish to see your prior code running, and see that it infinitely runs into captchas, here is the code I used to determine that:

import httpx
import trio
import random
from twocaptcha import TwoCaptcha
from bs4 import BeautifulSoup

# Scheme and host of the target site; reused by the Referer and Origin headers.
_SITE_ORIGIN = 'https://iapps.courts.state.ny.us'

# Default headers handed to the HTTP client so requests carry browser-like
# User-Agent, Referer and Origin values.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) '
        'Gecko/20100101 Firefox/122.0'
    ),
    'Referer': _SITE_ORIGIN + '/nyscef/CaseSearch?TAB=courtDateRange',
    'Origin': _SITE_ORIGIN,
}

# API key passed to the TwoCaptcha solver.
API_KEY = 'hidden'

async def solve_captcha(sitekey, page_url, attempts=3):
    """Try up to *attempts* times to get an hCaptcha token from 2captcha.

    Args:
        sitekey: hCaptcha site key taken from the page.
        page_url: URL of the page showing the captcha.
        attempts: Maximum number of solver calls before giving up.

    Returns:
        The ``'code'`` entry of the solver's result, or ``None`` when every
        attempt raised.
    """
    solver = TwoCaptcha(API_KEY)

    def request_solution():
        return solver.hcaptcha(sitekey=sitekey, url=page_url, timeout=180)

    for try_no in range(1, attempts + 1):
        try:
            # The solver call runs in a worker thread so the event loop is
            # not held up while waiting for the result.
            result = await trio.to_thread.run_sync(request_solution)
            return result['code']
        except Exception as exc:
            print(f"attempt {try_no}: error solving captcha: {exc}")
            if try_no < attempts:
                await trio.sleep(10)  # pause 10 seconds before the next try
    return None

async def main():
    """Post the search form, solving every hCaptcha the site returns.

    Each iteration posts the form (with the latest captcha token, or
    ``None`` on the first pass) and looks for a ``div.h-captcha`` element
    in the response. While one is present, a new token is requested and the
    form is posted again. The loop has no upper bound: it ends only when a
    response contains no captcha or when a captcha cannot be solved. A
    captcha-free response with status 200 is written to ``r.html``.
    """
    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        page_url = 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange'
        await trio.sleep(random.uniform(1, 3))  # mimic real waiting period

        cap_solution = None
        attempts = 0

        while True:  # start captcha loop
            attempts += 1
            print(f"attempt # {attempts} to access the page")

            data = {
                'selCountyCourt': '4667226',
                'txtFilingDate': '02/14/2024',
                'h-captcha-response': cap_solution
            }
            response = await client.post(page_url, data=data)

            # check if captcha is on page
            soup = BeautifulSoup(response.text, 'html.parser')
            captcha_div = soup.find('div', class_='h-captcha')
            if not captcha_div:
                # no captcha in the response: stop retrying
                break

            print("captcha found, solving...")
            new_sitekey = captcha_div['data-sitekey']
            cap_solution = await solve_captcha(new_sitekey, page_url)
            if not cap_solution:
                print('faled to solve captcha')
                return

            # the next iteration re-posts the form with the new solution
            print('captcha solved, retrying...')
            await trio.sleep(random.uniform(2, 5))

        if response.status_code == 200:  # save page if loop is broken
            # An explicit encoding avoids UnicodeEncodeError on platforms whose
            # default text encoding cannot represent every character in the page.
            with open('r.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            print("all captchas completed, response saved.")
        else:
            print("failed")

# Script entry point: run the async main() under trio.
if __name__ == "__main__":
    trio.run(main)