python – rvest – and rselenium to extract dynamic dropdown menu items

I am trying to collect some menu information from a website, but I am stuck on correctly extracting the drop-down menu items.

I want the following items:

enter image description here

etc. for each of the drop-down menus on the distritos page.

However, some entries (for example Centre Badalona) have no drop-down menu, so there are no child items to collect for them.

For example, the code below can get me the following output:

> collectZonaPageSnapshot %>% 
+   html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked.re-GeographicSearchNext-checkboxItem--has-separator')
{xml_nodeset (9)}
[1] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Artigues - Llefià" href="/es/comprar/viviendas/badalona/artigues-llefia/l"><div class="sui-MoleculeCh ...
[2] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Bonavista - Bufalà - Morera" href="/es/comprar/viviendas/badalona/bonavista-bufala-morera/l"><div cla ...
[3] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Canyet - Pomar" href="/es/comprar/viviendas/badalona/canyet-pomar/l"><div class="sui-MoleculeCheckbox ...
[4] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Casagemes - Canyadó" href="/es/comprar/viviendas/badalona/casagemes-canyado/l"><div class="sui-Molecu ...
[5] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Centre Badalona" href="/es/comprar/viviendas/badalona/centre-badalona/l"><div class="sui-MoleculeChec ...
[6] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Gorg - Progrés" href="/es/comprar/viviendas/badalona/gorg-progres/l"><div class="sui-MoleculeCheckbox ...
[7] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Montigalà - Sant Crist" href="/es/comprar/viviendas/badalona/montigala-sant-crist/l"><div class="sui- ...
[8] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Port" href="/es/comprar/viviendas/badalona/port/l"><div class="sui-MoleculeCheckboxField" style=""><d ...
[9] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Salut - Lloreda" href="/es/comprar/viviendas/badalona/salut-lloreda/l"><div class="sui-MoleculeCheckb ...
> collectZonaPageSnapshot %>% 
+   html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked') 
{xml_nodeset (31)}
 [1] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Artigues - Llefià" href="/es/comprar/viviendas/badalona/artigues-llefia/l"><div class="sui-MoleculeC ...
 [2] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Artigues" href="/es/comprar/viviendas/badalona/artigues/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeFie ...
 [3] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Llefià" href="/es/comprar/viviendas/badalona/llefia/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
 [4] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Sant Roc" href="/es/comprar/viviendas/badalona/sant-roc/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeFie ...
 [5] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Bonavista - Bufalà - Morera" href="/es/comprar/viviendas/badalona/bonavista-bufala-morera/l"><div cl ...
 [6] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Bonavista" href="/es/comprar/viviendas/badalona/bonavista/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeF ...
 [7] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Bufalà" href="/es/comprar/viviendas/badalona/bufala/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
 [8] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Morera" href="/es/comprar/viviendas/badalona/morera/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
 [9] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Canyet - Pomar" href="/es/comprar/viviendas/badalona/canyet-pomar/l"><div class="sui-MoleculeCheckbo ...
[10] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Canyet" href="/es/comprar/viviendas/badalona/canyet/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
[11] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Mas Ram" href="/es/comprar/viviendas/badalona/mas-ram/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[12] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Pomar" href="/es/comprar/viviendas/badalona/pomar/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField--in ...
[13] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Casagemes - Canyadó" href="/es/comprar/viviendas/badalona/casagemes-canyado/l"><div class="sui-Molec ...
[14] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Canyadó" href="/es/comprar/viviendas/badalona/canyado/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[15] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Casagemes" href="/es/comprar/viviendas/badalona/casagemes/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeF ...
[16] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Manresà" href="/es/comprar/viviendas/badalona/manresa/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[17] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Centre Badalona" href="/es/comprar/viviendas/badalona/centre-badalona/l"><div class="sui-MoleculeChe ...
[18] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Gorg - Progrés" href="/es/comprar/viviendas/badalona/gorg-progres/l"><div class="sui-MoleculeCheckbo ...
[19] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Congrés" href="/es/comprar/viviendas/badalona/congres/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[20] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="El Remei" href="/es/comprar/viviendas/badalona/el-remei/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeFie ...

The first part gives me the “parent” menus. The second part gives me the “parent” and “child” menus but I can’t distinguish between the parent and child menus.

Expected output:

To be able to extract the URLs, names etc. with a similar structure to the menu page.

- Artigues - Llefià
-- Artigues
-- Llefià
-- Sant Roc

- Bonavista - Bufalà - Morera
-- Bonavista
-- Bufalà
-- Morera

- Canyet - Pomar
-- Canyet
-- Mas Ram
-- Pomar

etc. (Currently I can only get them in a "non-tree" format, i.e. I can't tell which item is a parent menu and which is a child menu.)

> collectZonaPageSnapshot %>% 
+   html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked') %>% 
+   html_text()
 [1] "Artigues - Llefià851"           "Artigues38"                     "Llefià714"                      "Sant Roc99"                     "Bonavista - Bufalà - Morera233" "Bonavista34"                   
 [7] "Bufalà156"                      "Morera43"                       "Canyet - Pomar53"               "Canyet6"                        "Mas Ram29"                      "Pomar18"                       
[13] "Casagemes - Canyadó40"          "Canyadó9"                       "Casagemes29"                    "Manresà2"                       "Centre Badalona141"             "Gorg - Progrés267"             
[19] "Congrés32"                      "El Remei25"                     "Gorg69"                         "Progrés - Pep Ventura132"       "Montigalà - Sant Crist209"      "Montigalà21"                   
[25] "Puigfred86"                     "Sant Crist97"                   "Port79"                         "Salut - Lloreda592"             "La Salut399"                    "Lloreda133"                    
[31] "Sistrells60"

Code:

library(RSelenium)
library(rvest)
library(tidyverse)
# Page that lists all zones ("todas-las-zonas") for Badalona.
distrito_url_to_get <- "https://www.fotocasa.es/es/comprar/viviendas/badalona/todas-las-zonas/l"

# Absolute XPaths of the two controls clicked once the page is open.
accept_button_xpath <- "/html/body/div[1]/div[4]/div/div/div/footer/div/button[2]"
distrito_toggle_xpath <- "/html/body/div[1]/div[2]/div[1]/div[3]/div/div[1]/div"

# Launch the RSelenium driver for Firefox on port 4536 and take its client.
rD <- rsDriver(browser = "firefox", port = 4536L)
remDr <- rD[["client"]]

# Open the page and maximise the browser window.
remDr$navigate(distrito_url_to_get)
remDr$maxWindowSize()

# click "Accept"
accept_button <- remDr$findElement(using = "xpath", value = accept_button_xpath)
accept_button$clickElement()

# click on Distrito
distrito_toggle <- remDr$findElement(using = "xpath", value = distrito_toggle_xpath)
distrito_toggle$clickElement()

# Click each drop-down toggle icon to "activate" the HTML page, i.e. so that
# (per the original note) the nested items are present in the page source that
# is read afterwards.
#
# The loop must iterate over the toggle-icon elements found here: the earlier
# `distritoDropDownElements` lookup was commented out, so referring to that
# name failed with "object not found".
distritoDropDownToggleIconElements <- remDr$findElements(
  using = "css selector",
  value = ".sui-MoleculeCheckboxField-toggleIcon"
)

# seq_along() yields an empty sequence when nothing matched, whereas
# 1:length(x) would iterate over c(1, 0) and fail on x[[1]].
for (i in seq_along(distritoDropDownToggleIconElements)) {
  distritoDropDownToggleIconElements[[i]]$clickElement()
}

# read in the HTML page
collectZonaPageSnapshot <- remDr$getPageSource()[[1]] %>%
  read_html()

# Every checked checkbox item (parents and children together), in document
# order.
zona_items <- collectZonaPageSnapshot %>%
  html_nodes(".re-GeographicSearchNext-checkboxItem.is-checked")

# In the sample output the parent entries are exactly the items that carry the
# extra "--has-separator" class; the items without it are children.
is_parent <- zona_items %>%
  html_attr("class") %>%
  str_detect(fixed("re-GeographicSearchNext-checkboxItem--has-separator"))

# part 1) - collect the parent menus
zona_items[is_parent]

# part 2) - collect the child menus only (the plain ".is-checked" selector
# also matched the parents, which made the two levels indistinguishable)
zona_items[!is_parent]

# part 3) - rebuild the tree: children follow their parent in document order,
# so a running count of parents seen so far identifies the group of each row.
# Index 1 of the lookup is NA so that any item appearing before the first
# parent gets no parent instead of a wrong one.
zona_names <- html_attr(zona_items, "title")
parent_lookup <- c(NA_character_, zona_names[is_parent])

zona_tree <- tibble(
  parent = parent_lookup[cumsum(is_parent) + 1L],
  name = zona_names,
  url = html_attr(zona_items, "href"),
  level = if_else(is_parent, "parent", "child")
)
zona_tree

Leave a Comment