By following the tutorial at https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d, I have started scraping data from a website that requires login information.
My website is a bit different. I do get a result, but it is not in the format I expect.
code:
import datetime
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Cookies sent with the search request (passed as `cookies=` to requests.get).
# NOTE(review): these look like values copied from a logged-in browser
# session -- presumably they expire and should not be shared or committed;
# confirm and replace with your own session values before running.
cookies = {
'CFID': '180615757',
'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
'_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
'_ga': 'GA1.2.147261521.1662080801',
'_gid': 'GA1.2.1149490171.1662080801',
'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani+7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8+nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE+X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj+39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY+y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu+pb0mSp5n+iKotUEn9h+sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG+3Qe3zAfpdrs=',
'__atuvc': '65%7C35%2C2%7C36',
'COOKIESTATUS': 'ON',
'HIDECOOKIEBANNER': 'TRUE',
'nlbi_2388351': 'jGGxMFazFBqnU+x+okRrFAAAAAC/AJ/k+R2U+vs5Q4LIRTS7',
'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
'incap_ses_989_2388351': 'mWy+Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx+BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
'__atuvs': '6314ec0cdbe92a78001',
'_gat_gtag_UA_12825325_1': '1',
}
# HTTP request headers sent with the search request (passed as `headers=` to
# requests.get). The values mimic a Firefox 104 navigation request.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
# Accept-Encoding is deliberately left unset here:
# 'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.higheredjobs.com/admin/',
'Connection': 'keep-alive',
# The Cookie header is not set here; the cookie values are passed separately
# through the `cookies` dict (Requests sorts cookies= alphabetically).
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
# Requests doesn't support trailers, so the TE header is omitted:
# 'TE': 'trailers',
}
# Query-string parameters for the search URL: the job-category id and its
# display name.
params = dict(JobCat='141', CatName='Academic Advising')
# Fetch the search-results page. A timeout stops the script from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors instead of
# silently parsing an error page.
response = requests.get(
    'https://www.higheredjobs.com/admin/search.cfm',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Select only the outer record rows. Also matching the inner
# "col-sm-5 text-sm-right" divs (as before) returned nested and unrelated
# fragments, so the frame ended up with raw, whitespace-laden text blobs.
job_title = soup.select('.row.record')

# Build one dict per record so that each text fragment lands in its own column.
# NOTE(review): assumes every record yields its strings in this order (title,
# institution, location, category, posting date) -- confirm against the live
# page. zip() silently drops extra strings and leaves missing ones as NaN.
columns = ['Jobs title', 'Company name', 'location', 'Category', 'Posted']
jobs_list = [
    dict(zip(columns, record.stripped_strings))
    for record in job_title
]
df = pd.DataFrame(jobs_list, columns=columns)
Present output:
df =
Jobs title
0 \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70\...
1 \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2 \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...
Expected output:
df =
Jobs title Company name location Posted
0 Assistant Professor/Associate University of Southern Indiana Evansville, IN 09/02/22
Professor of Engineering,
Pott College of Science,
Engineering, and Education - F22057F1
>Solution :
The main issue is that you are trying to create your DataFrame from the unstructured data collected in your list.
So structure it first, e.g. as a dict per record, append each dict to your list, and then create your DataFrame:
# Build one dict per job record: pair each whitespace-stripped text fragment
# of the record with a column name, then let pandas turn the list of dicts
# into a table. zip() stops at the shorter input, so extra fragments are
# dropped and missing ones simply leave that column empty.
columns = ['title', 'university', 'location', 'study', 'date']
jobs_list = [
    dict(zip(columns, i.stripped_strings))
    for i in soup.select('.row.record')
]
pd.DataFrame(jobs_list)
Note: If you would like to change the column headers, edit this list -> ['title','university','location','study','date']
Example
from bs4 import BeautifulSoup
# Sample markup for the example: two "row record" entries. Each holds a
# "col-sm-7" div (linked job title followed by two <br/>-separated text lines)
# and a "col-sm-5 text-sm-right" div (a subject line and a "Posted <date>" line).
html ='''
<div class="row record">
<div class="col-sm-7"><a href="details.cfm?JobCode=178085874&Title=Assistant%20Professor%2FAssociate%20Professor%20of%20Engineering%2C%20Pott%20College%20of%20Science%2C%20Engineering%2C%20and%20Education%20%2D%20F22057F1">
Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
University of Southern Indiana <br/>
Evansville, IN
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div class="col-sm-7">
<a href="details.cfm?JobCode=178085843&Title=Assistant%20Professor%20of%20Engineering%20F99507">
Assistant Professor of Engineering F99507</a>
<br/>
McNeese State University <br/>
Lake Charles, LA
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
'''
# pandas is needed for the final DataFrame; imported here so that the example
# runs on its own.
import pandas as pd

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

# One dict per record: pair each whitespace-stripped text fragment with a
# column name. zip() stops at the shorter input.
columns = ['title', 'university', 'location', 'study', 'date']
jobs_list = [
    dict(zip(columns, i.stripped_strings))
    for i in soup.select('.row.record')
]
pd.DataFrame(jobs_list)
Output
| | title | university | location | study | date |
|---|---|---|---|---|---|
| 0 | Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1 | University of Southern Indiana | Evansville, IN | Electrical Engineering | Posted 09/02/22 |
| 1 | Assistant Professor of Engineering F99507 | McNeese State University | Lake Charles, LA | Electrical Engineering | Posted 09/02/22 |