I need to scrape data from this link: https://flo.uri.sh/visualisation/15396850/embed?auto=1
This is my Python code:
import pandas as pd
import html.parser
import requests
import urllib.request
from selenium import webdriver
# Embed URL of the visualisation whose data is to be scraped.
base_url = "https://flo.uri.sh/visualisation/15396850/embed?auto=1"
class MyParser(html.parser.HTMLParser):
    """Collect every text node of an HTML document, in document order.

    Attributes:
        matches: Text chunks passed to ``handle_data``, in the order seen.
        match_count: Number of chunks stored in ``matches``.
        container_seen: True once a ``<div>`` marked with
            ``fl-layout-primary-container`` (as an attribute name or as a
            class token) has been encountered.
    """

    CONTAINER_MARKER = "fl-layout-primary-container"

    def __init__(self, html=None):
        """Create an empty parser.

        Args:
            html: Accepted for backward compatibility only and not used;
                the markup is supplied through ``feed``.
        """
        super().__init__()
        self.matches = []
        self.match_count = 0
        self.container_seen = False

    def handle_data(self, data):
        """Store one text chunk and bump the running count."""
        self.matches.append(data)
        self.match_count += 1

    def handle_starttag(self, tag, attrs):
        """Flag the primary container ``<div>`` when it is opened.

        The base parser already calls ``handle_data`` for text nodes, so
        this handler must not call it itself (doing so without an argument
        raised ``TypeError``).
        """
        if tag != "div":
            return
        attrs = dict(attrs)
        # A valueless attribute is reported with value None, so test for the
        # key rather than for a truthy value; also accept the class form.
        class_tokens = (attrs.get("class") or "").split()
        if (
            self.CONTAINER_MARKER in attrs
            or self.CONTAINER_MARKER in class_tokens
        ):
            self.container_seen = True
# The URL has no placeholders, so it is used as-is (the original
# ``base_url.format(page_number)`` referenced an undefined name).
url = base_url

# Load the page in Chrome through Selenium and capture its source.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # NOTE: this rebinds the module-level name ``html``, shadowing the
    # imported ``html`` package from here on.
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails.
    driver.quit()

# Feed the captured markup to the parser and print each collected text chunk.
parser = MyParser(html)
parser.feed(html)
for item in parser.matches:
    print(item)
The output I get contains the metadata for the table, but I have not been able to figure out how to convert it into a CSV table.
This is what my output looks like.
>Solution :
The data you’re looking for is embedded as JSON inside the HTML page. To load it into a pandas DataFrame and save it as CSV, you can use the following example:
import json
import re
import pandas as pd
import requests
def extract_js_object(variable_name, page_source):
    """Return the JSON object assigned to ``variable_name`` in ``page_source``.

    The page is searched for ``<variable_name> = {...}`` and the captured
    text (up to the last ``}`` on that line) is decoded as JSON.

    Raises:
        ValueError: If no such assignment is present in ``page_source``.
    """
    pattern = re.escape(variable_name) + r" = (.*})"
    match = re.search(pattern, page_source)
    if match is None:
        raise ValueError(f"{variable_name!r} not found in page source")
    return json.loads(match.group(1))


url = "https://flo.uri.sh/visualisation/15396850/embed?auto=1"

# Fail loudly on a stalled connection or an HTTP error instead of trying to
# parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_source = response.text

column_names = extract_js_object("_Flourish_data_column_names", html_source)
data = extract_js_object("_Flourish_data", html_source)

# Each entry of data["rows"] carries its cell values under "columns".
all_data = [r["columns"] for r in data["rows"]]

df = pd.DataFrame(all_data, columns=column_names["rows"]["columns"])

# to_csv returns None when given a path, so there is nothing worth printing.
df.to_csv("data.csv", index=False)
Saves data.csv (screenshot from LibreOffice):
