Follow

Follow

Contact

Home Scrape only 1 page I want to scrape multiple pages with selenium

Questions

Scrape only 1 page I want to scrape multiple pages with selenium

byMR

June 22, 2022

I am trying to scrape multiple pages with selenium but they will scrape only 1 page what mistake I will do is there any solution then provide us this is the page link https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)


def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        
        URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?qvrtqca=&filters%5Brechtsgebieden%5D=%5B%5D&ypb=&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D=%7B%22lat%22%3A%2252.132633%22%2C%22lng%22%3A%225.291266%22%7D&locatie%5Bstraal%5D=56&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Bhash%5D='
        driver.get(URL)
        time.sleep(3)
        page=1
        page_links = [element.get_attribute('href') for element in
                    driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]

        data=[]
        for link in page_links:
            wev={}
            driver.get(link)
            time.sleep(2)
            try:
                title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
            except:
                pass
            wev['title']=title
            
            try:
                advocaten=driver.find_element(By.CSS_SELECTOR,".secondary").text
            except:
                pass
            
            wev['advocaten']=advocaten
            
            details=driver.find_elements(By.XPATH,"//section[@class='lawyer-info']")
            for detail in details:
                
                try:
                    address=detail.find_element_by_xpath("//div[@class='column medium-6']").text.strip()
                except:
                    pass
                wev['address']=address
                try:
                    email=detail.find_element(By.XPATH, "//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
                except:
                    pass
                wev['email']=email
                try:
                    website=detail.find_element(By.XPATH, "//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
                except:
                    pass
                
                wev['website']=website
                
                data.append(wev)
                
            if len(driver.find_elements_by_xpath("//a[@class='button next']")) > 0:
                url = "https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={}".format(page)
                driver.get(url)
                page += 1
                if int(page)>5:
                    break
                else:
                    break           
    df=pd.DataFrame(data)
    print(df)

>Solution :

You can make the pagination in starting url using for loop as follows:

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options)

data=[]
def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        
        URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'
        for page in range(1,11):

            driver.get(URL.format(page=page))
            time.sleep(3)
        
            page_links = [element.get_attribute('href') for element in driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]

            
            for link in page_links:
                wev={}
                driver.get(link)
                time.sleep(2)
                try:
                    title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
                except:
                    pass
                wev['title']=title
            
                try:
                    advocaten=driver.find_element(By.CSS_SELECTOR,".secondary").text
                except:
                    pass
            
                wev['advocaten']=advocaten
            
                details=driver.find_elements(By.XPATH,"//section[@class='lawyer-info']")
                for detail in details:
                
                    try:
                        address=detail.find_element_by_xpath("//div[@class='column medium-6']").text.strip()
                    except:
                        pass
                    wev['address']=address
                    try:
                        email=detail.find_element(By.XPATH, "//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
                    except:
                        pass
                    wev['email']=email
                    try:
                        website=detail.find_element(By.XPATH, "//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
                    except:
                        pass
                
                    wev['website']=website
                
                    data.append(wev)
                

df=pd.DataFrame(data)
print(df)

You also can try:

URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'
        for page in range(1,11):
            url=URL.format(page=page)

            driver.get(url)

web-scraping

byMR

Published June 22, 2022

Add a comment

Leave a ReplyCancel reply

Read more

Questions

How to rewrite 3 ngIf to ngSwitch

byMR

June 22, 2022

Questions

Android Bundling failed in expo-cli when we run "npm start" or scan expo app in mobile

byMR

June 22, 2022

Questions

How to display container that is centered with content that is in the far left and far right of it

byMR

June 22, 2022

Questions

Why isn't my subclass inheriting a method from the superclass?

byMR

June 22, 2022

Questions

Nodejs – How to prevent sql injection [using ONLY vanilla mysql library]

byMR

June 22, 2022

Questions

Haskell warning

byMR

June 22, 2022