A list element passed as a variable to soup.find() does not work, but the same arguments work when hard-coded

The variable headline_html comes back as None when I try to use the element_target element as an argument in soup.find().

If I hard-code the contents of element_target in soup.find(), it works, like this: soup.find("h2", class_="story__title"). Printing out element_target gives exactly this string.

There is no issue with the html target.

I’ve tried swapping the " and ' quotes in paper[3], to no avail. Assigning it to a variable before using it in soup.find() did not help either.

What am I missing?

from difflib import restore
from venv import create
from requests import get
import random
import psycopg2
from bs4 import BeautifulSoup
import database
import requests
import datetime

papers = [
    [1, "", "The Daily Mirror", '"h2", class_="story__title"'],
    [2, "", "The Guardian", "'span', class_='js-headline-text'"],
    [3, "", "The Sun", "'p', class_='teaser__subdeck'"],
    [4, "", "The Financial Times", "'div', class_='o-teaser__heading'"],
    [5, "", "The Daily Mail", "'h2', class_='linkro-darkred'"],
    [6, "", "The Times", "'h3', class_='Headline--xl'"]


def scrapeHeadlines():
    scrape_results = []

    randomUrls = [ 

    headers = {
        'User-Agent': 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Referer': random.choice(randomUrls) 

    fail = "Sorry but we could not get the headline for " + str(paper[2])
    timestamp = '{:%b-%d-%Y %H:%M:%S}'.format(
    id = paper[0]
    url = paper[1]
    newspaper = paper[2]
    element_target = paper[3]

    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser") # this can be printed out and works
    # headline_html = soup.find(paper[3])
    headline_html = soup.find(element_target) # this is not working like this

    if headline_html != None:
      headline = headline_html.text.strip()
      headline = fail 
                    'id': id,
                    'paper': newspaper,
                    'headline': headline,
                    'headline_html': headline_html

for paper in papers:

The result is:

[{'id': 1, 'paper': 'The Daily Mirror', 'headline': 'Sorry but we could not get the headline for The Daily Mirror', 'headline_html': None}]
[{'id': 2, 'paper': 'The Guardian', 'headline': 'Sorry but we could not get the headline for The Guardian', 'headline_html': None}]
[{'id': 3, 'paper': 'The Sun', 'headline': 'Sorry but we could not get the headline for The Sun', 'headline_html': None}]
[{'id': 4, 'paper': 'The Financial Times', 'headline': 'Sorry but we could not get the headline for The Financial Times', 'headline_html': None}]
[{'id': 5, 'paper': 'The Daily Mail', 'headline': 'Sorry but we could not get the headline for The Daily Mail', 'headline_html': None}]
[{'id': 6, 'paper': 'The Times', 'headline': 'Sorry but we could not get the headline for The Times', 'headline_html': None}]

>Solution :

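You’ve stringified the parameters of find(). The whole string '"h2", class_="story__title"' is passed as a single positional argument, so BeautifulSoup searches for a tag whose name is literally that text, finds nothing, and returns None. You need to split the tag name and the class out into separate values and pass them as separate arguments: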

papers = [
    [1, "", "The Daily Mirror", "h2", "story__title"],

def scrapeHeadlines(paper):
    element_target = paper[3]
    class_ = paper[4]
    headline_html = soup.find(element_target, class_=class_)

for paper in papers:

Leave a Reply