Python: scrape useful information from a webpage with login

September 4, 2022

By following https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d I have started scraping data from a website that requires a login.
My website is a bit different: I do get a result, but it is not in the format I expected.
code:

import datetime
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Cookies sent along with the request (passed as cookies= to requests.get()).
# NOTE(review): these look like values copied from a logged-in browser session
# (session/anti-bot/analytics cookies) — presumably they expire and have to be
# refreshed; treat them as credentials and do not publish real ones.
cookies = {
    'CFID': '180615757',
    'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
    'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
    '_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
    '_ga': 'GA1.2.147261521.1662080801',
    '_gid': 'GA1.2.1149490171.1662080801',
    'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani+7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8+nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE+X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj+39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY+y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu+pb0mSp5n+iKotUEn9h+sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG+3Qe3zAfpdrs=',
    '__atuvc': '65%7C35%2C2%7C36',
    'COOKIESTATUS': 'ON',
    'HIDECOOKIEBANNER': 'TRUE',
    'nlbi_2388351': 'jGGxMFazFBqnU+x+okRrFAAAAAC/AJ/k+R2U+vs5Q4LIRTS7',
    'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
    'incap_ses_989_2388351': 'mWy+Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
    'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx+BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
    '__atuvs': '6314ec0cdbe92a78001',
    '_gat_gtag_UA_12825325_1': '1',
}

# Browser-like request headers sent with the request; the User-Agent string
# identifies the client as Firefox 104 on 64-bit Windows 10.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.higheredjobs.com/admin/',
    'Connection': 'keep-alive',
    # Requests sorts cookies= alphabetically
    # No explicit 'Cookie' header is set here: the cookie values are supplied
    # through the separate cookies= argument of requests.get() instead.
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

# Query-string parameters sent with the search request.
params = {
    'JobCat': '141',
    'CatName': 'Academic Advising',
}

# Fetch the search page using the cookies and headers defined above.
# - timeout: Requests applies no timeout by default, so without one a stalled
#   connection would hang the script indefinitely.
# - raise_for_status(): raises requests.HTTPError on a 4xx/5xx answer, so an
#   error page is never parsed as if it were the result list.
response = requests.get(
    'https://www.higheredjobs.com/admin/search.cfm',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=30,
)
response.raise_for_status()

# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(response.text, 'html.parser')

# A list passed to class_ matches a <div> whose class attribute equals *either*
# string, so both the record rows and the "col-sm-5 text-sm-right" column
# nested inside them are returned.
job_title = soup.find_all('div', class_=["row record", "col-sm-5 text-sm-right"])

# .text returns the raw text of each match, including all of the markup's
# newlines and tabs - which is why the resulting column looks unstructured.
jobs_list = [tag.text for tag in job_title]

# Needs pandas to be imported as pd at the top of the script.
df = pd.DataFrame({'Jobs title': jobs_list})

Present output:

df = 
Jobs title
0   \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70\...
1   \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2   \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...

Expected output:

df = 
     Jobs title                     Company name                   location         Posted
0   Assistant Professor/Associate  University of Southern Indiana  Evansville, IN   09/02/22
    Professor of Engineering, 
    Pott College of Science, 
    Engineering, and Education - F22057F1

>Solution :

The main issue is that you are trying to create your DataFrame from the unstructured data that is collected in your list.

So structure it first, e.g. as one dict per record, collect these dicts in your list, and then create your DataFrame:

# Column names, in the order the text fragments appear inside one record.
columns = ['title','university','location','study','date']

# One dict per '.row.record' element: each whitespace-stripped text fragment
# of the record is paired with the column name at the same position.
jobs_list = [
    dict(zip(columns, record.stripped_strings))
    for record in soup.select('.row.record')
]

pd.DataFrame(jobs_list)

Note: If you would like to change the column headers, change this list -> ['title','university','location','study','date']

Example

import pandas as pd
from bs4 import BeautifulSoup

# Two sample job records using the same markup as the result page.
html = '''
<div class="row record">
<div class="col-sm-7"><a href="details.cfm?JobCode=178085874&amp;Title=Assistant%20Professor%2FAssociate%20Professor%20of%20Engineering%2C%20Pott%20College%20of%20Science%2C%20Engineering%2C%20and%20Education%20%2D%20F22057F1">
                                            Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
                                        University of Southern Indiana <br/>
                                            Evansville, IN 
                                    </div>
<div class="col-sm-5 text-sm-right">
                                        
                                        Electrical Engineering 
                                            <br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div class="col-sm-7">
<a href="details.cfm?JobCode=178085843&amp;Title=Assistant%20Professor%20of%20Engineering%20F99507">
                                            Assistant Professor of Engineering F99507</a>
<br/>
                                        McNeese State University <br/>
                                            Lake Charles, LA 
                                    </div>
<div class="col-sm-5 text-sm-right">
                                        
                                        Electrical Engineering 
                                            <br/> Posted 09/02/22<br/>
</div>
</div>
'''
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a GuessedAtParserWarning.
soup = BeautifulSoup(html, 'html.parser')

# Column names, in the order the text fragments appear inside one record.
columns = ['title','university','location','study','date']

# One dict per '.row.record' element: each whitespace-stripped text fragment
# of the record is paired with the column name at the same position.
# zip() stops at the shorter input, so a record with fewer than five fragments
# simply ends up with fewer keys.
jobs_list = [
    dict(zip(columns, record.stripped_strings))
    for record in soup.select('.row.record')
]

pd.DataFrame(jobs_list)

Output

	title	university	location	study	date
0	Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1	University of Southern Indiana	Evansville, IN	Electrical Engineering	Posted 09/02/22
1	Assistant Professor of Engineering F99507	McNeese State University	Lake Charles, LA	Electrical Engineering	Posted 09/02/22