I have been trying to convert table data extracted using Selenium into Pandas DataFrame.
Here is my code:
import os
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import csv
import sys
driver = webdriver.Chrome()
driver.maximize_window()
action_chains = ActionChains(driver)
wait = WebDriverWait(driver, 10)
def wait_for_element_location_to_be_stable(element):
initial_location = element.location
previous_location = initial_location
start_time = time.time()
while time.time() - start_time < 1:
current_location = element.location
if current_location != previous_location:
previous_location = current_location
start_time = time.time()
time.sleep(0.4)
def continuous_scroll_with_result(by, locator):
result = wait.until(EC.presence_of_all_elements_located((by, locator)))
while True:
temp = result[-1]
action_chains.scroll_to_element(temp).perform()
for i in range(3):
action_chains.send_keys(Keys.ARROW_DOWN).perform()
time.sleep(0.5)
wait_for_element_location_to_be_stable(temp)
result = wait.until(EC.presence_of_all_elements_located((by, locator)))
if result[-1] == temp:
break
return result
driver.get('https://au.finance.yahoo.com/')
time.sleep(5)
search_box = driver.find_element(By.ID, 'yfin-usr-qry')
search_box.send_keys('TSLA')
time.sleep(5)
search_box.submit()
time.sleep(5)
historical_data = driver.find_element(By.XPATH, '//*[@id="quote-nav"]/ul/li[4]/a/span')
action = ActionChains(driver)
action.click(on_element = historical_data)
action.perform()
time.sleep(5)
Table = continuous_scroll_with_result(By.CSS_SELECTOR, '[data-test=historical-prices] tbody tr')
Table_head = continuous_scroll_with_result(By.CSS_SELECTOR, '[data-test=historical-prices] thead')
for ele in Table:
soup1= bs(ele.get_attribute('outerHTML'), 'html.parser')
Table_data = []
for row in soup1.find_all('tr'):
columns = row.find_all('td')
output_row = []
for column in columns:
output_row.append(column.text)
Table_data.append(output_row)
for elem in Table_head:
soup2= bs(elem.get_attribute('outerHTML'), 'html.parser')
Table_headers = []
for th in soup2.find_all('th'):
Table_headers.append(th.text)
df = pd.DataFrame(Table_data, columns=Table_headers)
However, it only got me the last row of data when attempting to turn the data into Pandas DataFrame. I’m hoping to get every extracted row of data in DataFrame.
Could anyone please help me with this ?
>Solution :
This happens because of the way your for loop is constructed, your soup1 just ends up being the last element. Fixed it by properly indenting your code and making minor changes:
Table_data = []
for ele in Table:
soup1= bs(ele.get_attribute('outerHTML'), 'html.parser')
for row in soup1.find_all('tr'):
columns = row.find_all('td')
output_row = []
for column in columns:
output_row.append(column.text)
Table_data.append(output_row)
Table_headers = []
elem= Table_head[-1]
soup2= bs(elem.get_attribute('outerHTML'), 'html.parser')
for th in soup2.find_all('th'):
Table_headers.append(th.text)
df = pd.DataFrame(Table_data, columns=Table_headers)
Output:
Date Open High Low Close* Adj. close** Volume
0 22 Sept 2023 257.40 257.79 244.48 244.88 244.88 126,311,133
1 21 Sept 2023 257.85 260.86 254.21 255.70 255.70 119,531,000
2 20 Sept 2023 267.04 273.93 262.46 262.59 262.59 122,514,600
3 19 Sept 2023 264.35 267.85 261.20 266.50 266.50 103,704,000
4 18 Sept 2023 271.16 271.44 263.76 265.28 265.28 101,543,300
... ... ... ... ... ... ... ...
246 29 Sept 2022 282.76 283.65 265.78 268.21 268.21 77,620,600
247 28 Sept 2022 283.08 289.00 277.57 287.81 287.81 54,664,800
248 27 Sept 2022 283.84 288.67 277.51 282.94 282.94 61,925,200
249 26 Sept 2022 271.83 284.09 270.31 276.01 276.01 58,076,900
250 23 Sept 2022 283.09 284.50 272.82 275.33 275.33 63,748,400