Twitter parsing with Selenium in Python


I'm new to parsing with Selenium. I got a task: "you should get Elon Musk's last 10 tweets and show their text in the terminal". I tried to do that like this:


import os
import time

from selenium.webdriver.common.by import By
from selenium_stealth import stealth
from seleniumwire import webdriver
from dotenv import load_dotenv

load_dotenv()
LOGIN = os.getenv('LOGIN')
PASSWORD = os.getenv('PASSWORD')
IP_ADRESS = os.getenv('FR_IP_ADRESS')
PORT = os.getenv('FR_PORT')

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('--ignore-certificate-errors')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

proxy_options = {
    'proxy': {
        'http': f'http://{LOGIN}:{PASSWORD}@{IP_ADRESS}:{PORT}'
    }
}

try:
    link = 'https://twitter.com/elonmusk'
    browser = webdriver.Chrome(
        options=options,
        seleniumwire_options=proxy_options
    )

    stealth(
        browser,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )

    browser.get(link)

    browser.implicitly_wait(20)
    target = browser.find_elements(
            By.CSS_SELECTOR, '[data-testid="tweet"]'
        )

    browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")

    twits = browser.find_elements(
            By.CSS_SELECTOR, '[data-testid="tweet"] [data-testid="tweetText"]'
        )

    for twit in twits[::10]:
        print(twit.text)

finally:
    time.sleep(20)
    browser.quit()

The first problem is that the scrolling call browser.execute_script("window.scrollTo(0, document.body.scrollHeight)") doesn't work, although it worked fine in the first version of my code. The second problem is that my tweet collector either collects nothing or collects different values on every run. I tried different scrolling methods, but they only work some of the time, and the same thing happens with the tweet collector. I think the problem could be connected with how Twitter loads its content, but I can't be sure because I don't have enough parsing experience. I hope you can help me. Thank you in advance.


There is 1 best solution below.

Best answer by Aariyan Patel:

Selenium's find_elements is not working reliably here, so we can pass the HTML content to BeautifulSoup and extract the tweet text from it.

Here is the full script to get the text of the first 10 tweets:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

options = Options()
#options.add_argument("user-data-dir=C:\\Users\\yourusername\\AppData\\Local\\Google\\Chrome Beta\\User Data")
options.add_argument("profile-directory=Default")

driver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=driver_service, options=options)

try:
    link = 'https://twitter.com/elonmusk'
    driver.get(link)

    tweet_text_elements = []

    # Scroll 5 times to load more tweets
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2) 

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all elements with 'tweetText'
        tweet_text_elements.extend(soup.find_all('div', {'data-testid': 'tweetText'}))

    # Print the first 10 tweets
    for i, tweet_text_element in enumerate(tweet_text_elements[:10]):
        tweet_text = tweet_text_element.text.strip().replace("\n", "")
        print(f"Tweet {i+1}: {tweet_text}")

finally:
    driver.quit()
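One note on this approach: because the full page_source is re-parsed after every scroll, the same tweets get appended to tweet_text_elements several times, so the list contains duplicates. Depending on how many tweets the first page load rendered, even the first 10 entries can already contain repeats, so a small deduplication step is a safer way to end up with 10 unique tweets. This is only an illustrative sketch that reuses tweet_text_elements from the script above (the seen and unique_tweets names are just for illustration):

# Keep only the first occurrence of each tweet text, in the order it was seen.
seen = set()
unique_tweets = []
for element in tweet_text_elements:
    text = element.get_text(" ", strip=True)
    if text and text not in seen:
        seen.add(text)
        unique_tweets.append(text)

for i, text in enumerate(unique_tweets[:10]):
    print(f"Tweet {i+1}: {text}")

Also note the commented-out user-data-dir option: twitter.com may require a logged-in session before it shows a profile's timeline, so pointing Chrome at a profile where you are already logged in can be what makes the tweets load in the first place.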