The list element variable is not working as an argument in soup.find() but works if I hard code it in

The variable headline_html comes back as None when I try to use the element_target element as an argument in soup.find().

If I hard code the element_target string in soup.find(), it works. Like this: soup.find("h2", class_="story__title"). And printing out element_string gives this string.

There is no issue with the html target.

I’ve tried reversing the " and ‘ in paper[3] to no avail. setting it as a variable before using it in soup.find() also to no avail.

What am I missing?

from difflib import restore
from venv import create
from requests import get
import random
import psycopg2
from bs4 import BeautifulSoup
import database
import requests
import datetime

papers = [
    [1, "https://www.mirror.co.uk/", "The Daily Mirror", '"h2", class_="story__title"'],
    [2, "https://www.theguardian.com/uk", "The Guardian", "'span', class_='js-headline-text'"],
    [3, "https://www.thesun.co.uk/", "The Sun", "'p', class_='teaser__subdeck'"],
    [4, "https://www.ft.com/world/uk", "The Financial Times", "'div', class_='o-teaser__heading'"],
    [5, "https://www.dailymail.co.uk/home/index.html", "The Daily Mail", "'h2', class_='linkro-darkred'"],
    [6, "https://www.thetimes.co.uk/uk", "The Times", "'h3', class_='Headline--xl'"]

]    

def scrapeHeadlines():
    scrape_results = []

    randomUrls = [ 
    "https://www.facebook.com/", 
    "https://www.google.co.uk", 
    "https://www.twitter.com"
    ]

    headers = {
        'User-Agent': 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Referer': random.choice(randomUrls) 
        }

    fail = "Sorry but we could not get the headline for " + str(paper[2])
    timestamp = '{:%b-%d-%Y %H:%M:%S}'.format(datetime.datetime.now())
    id = paper[0]
    url = paper[1]
    newspaper = paper[2]
    element_target = paper[3]

    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser") # this can be printed out and works
    # headline_html = soup.find(paper[3])
    headline_html = soup.find(element_target) # this is not working like this

    if headline_html != None:
      headline = headline_html.text.strip()
    else:
      headline = fail 
    
    scrape_results.append({
                    'id': id,
                    'paper': newspaper,
                    'headline': headline,
                    'headline_html': headline_html
                    })
    print(scrape_results)

for paper in papers:
    scrapeHeadlines()

The result is:

{'id': 1, 'paper': 'The Daily Mirror', 'headline': 'Sorry but we could not get the headline for The Daily Mirror', 'headline_html': None}]
[{'id': 2, 'paper': 'The Guardian', 'headline': 'Sorry but we could not get the headline for The Guardian', 'headline_html': None}]
[{'id': 3, 'paper': 'The Sun', 'headline': 'Sorry but we could not get the headline for The Sun', 'headline_html': None}]
[{'id': 4, 'paper': 'The Financial Times', 'headline': 'Sorry but we could not get the headline for The Financial Times', 'headline_html': None}]
[{'id': 5, 'paper': 'The Daily Mail', 'headline': 'Sorry but we could not get the headline for The Daily Mail', 'headline_html': None}]
[{'id': 6, 'paper': 'The Times', 'headline': 'Sorry but we could not get the headline for The Times', 'headline_html': None}]

>Solution :

You’ve stringified the parameters of find(). You need to split these out:

papers = [
    [1, "https://www.mirror.co.uk/", "The Daily Mirror", "h2", "story__title"],
   ...
]

def scrapeHeadlines(paper):
    ...
    element_target = paper[3]
    class_ = paper[4]
    headline_html = soup.find(element_target, class_=class_)
    ...

for paper in papers:
    scrapeHeadlines(paper)

Leave a Reply