I am trying to webscrape this link in Python. The ideal output is a dataframe with 4 columns: date, author, title and text. So far, I got down to author, title and date in the following way:
from bs4 import BeautifulSoup
import requests
payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url= 'https://www.bis.org/doclist/cbspeeches.htm'
headers= {
"content-type": "application/x-www-form-urlencoded",
"X-Requested-With": "XMLHttpRequest"
}
req=requests.post(url,headers=headers,data=payload)
print(req)
soup = BeautifulSoup(req.content, "lxml")
data=[]
for card in soup.select('.documentList tbody tr'):
date = card.select('.item_date')
title = card.select_one('.title a').get_text()
author = card.select_one('.authorlnk.dashed').get_text().strip()
data.append({
'date': date,
'title':title,
'author':author
})
print(data)
Now, I find hard to extract the text for each of the 10 links in the page. I am doing the following:
data = []
for link in soup.select('.documentList tbody tr'):
r = BeautifulSoup(requests.get(f"https://www.bis.org{link['href']}").content,features="lxml")
data.append({
'Text': ''.join([str(e) for e in r.select('p')])})
However, I am not getting any good results around that code.
Can anyone help me with that?
Thanks!
>Solution :
You are close to your goal, simply handle the requests to the texts in your for loop:
for card in soup.select('.documentList tbody tr'):
r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content)
data.append({
'date': card.select_one('.item_date').get_text(strip=True),
'title': card.select_one('.title a').get_text(strip=True),
'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
'url': f"https://www.bis.org{card.a.get('href')}",
'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
})
Example
from bs4 import BeautifulSoup
import pandas as pd
import requests
payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url= 'https://www.bis.org/doclist/cbspeeches.htm'
headers= {
"content-type": "application/x-www-form-urlencoded",
"X-Requested-With": "XMLHttpRequest"
}
req=requests.post(url,headers=headers,data=payload)
soup = BeautifulSoup(req.content, "lxml")
data=[]
for card in soup.select('.documentList tbody tr'):
r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content)
data.append({
'date': card.select_one('.item_date').get_text(strip=True),
'title': card.select_one('.title a').get_text(strip=True),
'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
'url': f"https://www.bis.org{card.a.get('href')}",
'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
})
pd.DataFrame(data)