Creating a 3 column Data Frame in Pandas

94 Views Asked by At

I want to scrape a soccer website to create a dataset in Pandas. I don't know how to put the scraped player information into 3 columns (name, league, soccer team) and also add the country, so that everything fits into a table/data frame.

The information has been scraped, albeit not very neatly, but I'm not sure whether (or how) I should create an array and loop the information into lists or arrays.

from bs4 import BeautifulSoup

import requests

# Download the soccerway "players abroad" page for Nigeria. A browser-like
# User-Agent header is sent with the request.
url = 'https://ng.soccerway.com/players/players_abroad/nigeria/'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
page = req
soup = BeautifulSoup(page.text, 'html')

# All tables carrying the "playersabroad table" class (not used further in
# this snippet).
table = soup.find_all('table', class_="playersabroad table")

# Gather the stripped text of every <th> cell on the page.
# NOTE(review): presumably these header cells hold the country names - confirm
# against the page markup.
player_country = soup.find_all('th')
player_country_header = []
for header_cell in player_country:
    player_country_header.append(header_cell.text.strip())

print(player_country_header)

import pandas as pd
import numpy as np

# Empty frame with the three target columns; the scraped values still have to
# be loaded into it.
df = pd.DataFrame(columns=['player-name', 'League', 'team_name'])
#df = pd.DataFrame(columns = player_country_header ) df

# Flatten every <td> cell on the page into one list of stripped strings.
# These statements sit at module level: the original indentation here made
# Python raise an IndentationError because no block had been opened.
table_data = soup.find_all('td')
player_data_list = [data.text.strip() for data in table_data]
#length = len(df)
#df.loc[length] = player_data_list
print(player_data_list)
2

There are 2 best solutions below

3
On BEST ANSWER

With pandas, here is a proposal using a post-processed read_html:

# Names for the three player columns of the final frame; "country" is
# appended to them below.
cols = ["player-name", "League", "team_name"]

# Fetch the page with requests (sending a browser-like User-Agent) and hand
# the raw bytes to pandas; `[0]` keeps the first table that read_html found.
tmp = pd.read_html(requests.get(
    url, headers={"User-Agent": "Mozilla/5.0"}).content)[0]
    # the page is downloaded with requests because, per the author, the
    # `#storage_options` argument throws an HTTPError

df = (
    # Transposing, resetting the index and transposing back turns the column
    # labels of `tmp` into the first data row.
    tmp.T.reset_index().T # to slip down the incorrect 'England' header
        # Remove the column labelled 3, keep the text before its first ".",
        # and forward-fill the result into a new "country" column.
        # NOTE(review): presumably column 3 is only populated on country
        # header rows, and the "." strips a suffix pandas adds to duplicated
        # header labels - confirm against the parsed table.
        .assign(country=lambda x: x.pop(3).str.split(".").str[0].ffill())
        # Drop the first row (the former column labels), then keep only the
        # rows whose last column in `tmp` is NaN.
        # NOTE(review): presumably those are the player rows - confirm.
        .iloc[1:].loc[tmp.iloc[:, -1].isna()]
        # Label the four remaining columns.
        .set_axis(cols + ["country"], axis=1)
)

Output :

print(df)

      player-name          League          team_name  country
0        A. Iwobi  Premier League             Fulham  England
1      T. Awoniyi  Premier League  Nottingham Forest  England
2         O. Aina  Premier League  Nottingham Forest  England
3       F. Onyeka  Premier League          Brentford  England
4       C. Bassey  Premier League             Fulham  England
...           ...             ...                ...      ...
1078   S. Danjuma   Yemeni League      Al Ahli San'a    Yemen
1079  M. Alhassan   Yemeni League    Yarmuk al Rawda    Yemen
1080     A. Nweze   Yemeni League    Yarmuk al Rawda    Yemen
1081  A. Olalekan   Yemeni League      Al Sha'ab Ibb    Yemen
1082     A. Adisa   Yemeni League          Al Urooba    Yemen

[975 rows x 4 columns]
2
On

Interesting, I solved it this way

from bs4 import BeautifulSoup
import requests
import pandas as pd

# Download the soccerway "players abroad" page for Nigeria. A browser-like
# User-Agent header is sent with the request.
url = 'https://ng.soccerway.com/players/players_abroad/nigeria/'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
page = req
soup = BeautifulSoup(page.text, 'html.parser')

# Parallel lists, one entry per player row; they become the DataFrame columns.
names = []
leagues = []
teams = []
countries = []

table = soup.find('table', class_='playersabroad table')
# Iterate over every row, including the first one, so that the first country
# header is seen as well.
rows = table.find_all('tr')

# Country taken from the most recent header row. The previous version called
# soup.find('th') inside the loop, which always returns the first <th> of the
# whole page, so every player was assigned the same country.
# NOTE(review): assumes each country section starts with a row containing a
# <th> cell with the country name - confirm against the page markup.
current_country = None

for row in rows:
    header = row.find('th')
    if header is not None:
        current_country = header.text.strip()

    cols = [col.text.strip() for col in row.find_all('td')]

    # Only rows with at least three <td> cells are treated as player rows
    # (name, league, team).
    if len(cols) >= 3:
        names.append(cols[0])
        leagues.append(cols[1])
        teams.append(cols[2])
        countries.append(current_country)


data = {
    'player-name': names,
    'league': leagues,
    'team_name': teams,
    'country': countries
}

df = pd.DataFrame(data)

print(df)