The package currently scrapes and extracts information about real estate ads from a French website.

After installation, the package can be attached in the usual way:

library(estate)

Create URLs

The website displays at most 40 ads per page. The URLs of the first 21 pages are irregular:

  • page = 1: the page number does not appear at the end of the URL.
  • page < 22: variable component after “annonce/”.
  • page > 1: page number in URL.
# Page 1: the URL carries no page number at the end.
createUrl(page = 1, provider = "papfr", type = "vente")
## [1] "http://www.pap.fr/annonce/vente-appartements-paris-75-g439-40-annonces-par-page"
# Page 2: one of the irregular pages (< 22); the component after "annonce/"
# differs from page 1 and the page number is appended.
createUrl(page = 2, provider = "papfr", type = "vente")
## [1] "http://www.pap.fr/annonce/appartement-a-vendre-paris-75-g439-40-annonces-par-page-2"
# Page 22: the component after "annonce/" differs again; the page number is
# still appended.
createUrl(page = 22, provider = "papfr", type = "vente")
## [1] "http://www.pap.fr/annonce/vente-appartement-paris-75-g439-40-annonces-par-page-22"

Extract from HTML files

The various low-level extraction functions make use of the XML package to access nodes in the HTML page structure (elements like <div>, <p>).

  • URL for complete description
  • cover picture link
  • partial description
  • summary: number of rooms and bedrooms, surface in m2
  • price in Euros
  • zip code
  • date
# Path of the sample results page shipped with the package; despite its name,
# `url` holds a local file path here.
url <- system.file("extdata", "papfr-40.html", package = "estate")
# Parse the HTML file once; the extraction functions below all work on `doc`.
doc <- XML::htmlParse(url, encoding = "utf-8")
# Number of leading ads shown in each example.
nobs <- 2
# Links to the complete description of each ad.
estate:::extractLinkVignette(doc)[1:nobs]
## [1] "http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414901156"
## [2] "http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414801400"
# Cover picture links.
estate:::extractLinkPhoto(doc)[1:nobs]
## [1] "http://static.pap.fr/photos/B49/B49A1156.thumb.jpg"
## [2] "http://static.pap.fr/photos/B48/B48A1400.thumb.jpg"
# Partial descriptions; the outer parentheses print the assigned value.
(description <- estate:::extractDescription(doc)[1:nobs])
## [1] "Paris 14e (75014). Paris 14 ième, à deux stations de Montparnasse, Studette de 11,50 m² (11,43 Loi Carrez), au premier étage. En bon état. Salle d'eau avec douche et vraies toilettes. Bloc évier cuisinette (frigidaire, plaques,..."     
## [2] "Paris 6e (75006). Charment 2 pièces au calme sur cour et lumineux, situé au 4e étage sans ascenseur, en plein coeur de Saint germain des prés. Il est composé d'un salon, 1 chambre, 1 cuisine ouverte sur salon entièrement équipée, un..."
# Zip codes, taken from the description text rather than from `doc`.
estate:::extractCP(description)
## [1] "75014" "75006"
# Summary table: number of rooms, number of bedrooms and surface in m2.
estate:::extractSummaryTable(doc)[1:nobs,]
##   rooms bedr size
## 1     1   NA   11
## 2     2    1   37
# Prices in Euros.
estate:::extractPrice(doc)[1:nobs]
## [1] 103000 560000
# Dates of the ads.
estate:::extractDate(doc)[1:nobs]
## [1] "2017-01-23" "2017-01-23"
# All fields combined into one table, built directly from the file path.
estate_table <- extractList(url)
knitr::kable(estate_table[1:nobs, ])
price photo description link rooms bedr size location date
103000 http://static.pap.fr/photos/B49/B49A1156.thumb.jpg Paris 14e (75014). Paris 14 ième, à deux stations de Montparnasse, Studette de 11,50 m² (11,43 Loi Carrez), au premier étage. En bon état. Salle d’eau avec douche et vraies toilettes. Bloc évier cuisinette (frigidaire, plaques,… http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414901156 1 NA 11 75014 2017-01-23
560000 http://static.pap.fr/photos/B48/B48A1400.thumb.jpg Paris 6e (75006). Charment 2 pièces au calme sur cour et lumineux, situé au 4e étage sans ascenseur, en plein coeur de Saint germain des prés. Il est composé d’un salon, 1 chambre, 1 cuisine ouverte sur salon entièrement équipée, un… http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414801400 2 1 37 75006 2017-01-23

Pooled download of HTML files

In order to achieve good download performance, the curl package is used. This package is designed to make parallel calls to the same host. This way, we can retrieve the maximum number of 50 HTML pages efficiently. The download location “../inst/extdata/vente” assumes that you have extracted the package source and are building the vignette with devtools::build_vignettes(). Create the subfolder “vente” if it does not exist. To make the downloaded files visible to the system.file function, the package must be re-installed with devtools::install().

# Listing type to process: "vente" (sale) or "location" (rental).
type <- "vente"
## type <- "location"
# Folder with the HTML pages inside the *installed* package; system.file()
# returns "" when that folder is not part of the installed package.
htmldir <- system.file("extdata", type, package = "estate")
# Number of result pages to download and to process.
npages <- 50
## set eval=TRUE to activate download
## htmldir_temp <- file.path("../inst/extdata", type)
## setwd("vignettes")
htmldir_temp <- file.path("../inst/extdata", type) # run from /docs/articles folder
# Make sure the download folder exists so that no manual step is needed
# (a no-op when it is already there).
dir.create(htmldir_temp, recursive = TRUE, showWarnings = FALSE)
# Remove the pages of a previous download before fetching fresh ones.
unlink(list.files(htmldir_temp, full.names = TRUE))
downloadHtml(type = type, pages = seq_len(npages), htmldir = htmldir_temp)
devtools::install() # reads data from install dir

Vectorized extraction from HTML

# Combine the ads of several downloaded HTML pages into one table.
#
# Arguments:
#   type    Listing type (e.g. "vente"). Not used by the function itself; kept
#           so that existing calls continue to work.
#   htmldir Directory containing the downloaded HTML files.
#   pages   Maximum number of files to process, taken in the order returned
#           by list.files().
#
# Returns the row-bound results of extractList() for the processed files, or
# NULL when `htmldir` contains no files.
combineEstate <- function(type, htmldir, pages = 1) {
  # head() caps the selection at the number of files actually present.
  # Indexing with [1:pages] would pad the vector with NA and hand
  # non-existent "<htmldir>/NA" paths to extractList().
  htmlfiles <- utils::head(list.files(htmldir), pages)
  estate_list <- lapply(file.path(htmldir, htmlfiles), extractList)
  do.call("rbind", estate_list)
}

# Parse the downloaded pages into a single table.
estatedf <- combineEstate(type = type,
                          htmldir = htmldir,
                          pages = npages)

# Export the unfiltered table to a date-stamped, tab-separated file.
# NOTE(review): "/tmp" only exists on POSIX systems; adjust on Windows.
datenow <- format(Sys.Date(), format = "%Y-%m-%d")
## datenow <- "2017-01-30"
exportfile <- file.path("/tmp", paste0(datenow, "-estate-", type, ".tsv"))
write.table(x = estatedf, file = exportfile, row.names = FALSE, sep = "\t")

# Price window (in Euros) kept for the analysis; it depends on the listing
# type. An unsupported type fails here with a clear message instead of
# leaving price_min / price_max undefined for the filter below.
price_bounds <- switch(
  type,
  vente = c(2e5, 5e5),
  location = c(9e2, 16e2),
  stop("unsupported listing type: ", type, call. = FALSE)
)
price_min <- price_bounds[1]
price_max <- price_bounds[2]

# Keep only the ads strictly inside the price window.
estatedf <- subset(estatedf, price < price_max & price > price_min)
## estatedf <- subset(estatedf, price < price_max & price > price_min & date > "2017-09-09")
nrow(estatedf)
## [1] 702
# Price per square metre; NA where the size is missing.
estatedf$price_per_sqm <- estatedf$price / estatedf$size

# Pick the numeric columns with a type predicate instead of matching class
# names as strings; is.numeric() covers both integer and double columns.
is_numeric_col <- vapply(estatedf, is.numeric, logical(1))
intcols <- names(estatedf)[is_numeric_col]
# drop = FALSE keeps a data frame even when only one column is numeric, so
# summary() always prints one block per column.
summary(estatedf[, intcols, drop = FALSE])
##      price            rooms            bedr            size      
##  Min.   :230000   Min.   :1.000   Min.   :1.000   Min.   :16.00  
##  1st Qu.:290000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:24.00  
##  Median :367500   Median :2.000   Median :1.000   Median :30.00  
##  Mean   :365084   Mean   :1.678   Mean   :1.031   Mean   :34.25  
##  3rd Qu.:430000   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:44.00  
##  Max.   :499000   Max.   :4.000   Max.   :3.000   Max.   :79.00  
##                                   NA's   :287     NA's   :46     
##  price_per_sqm  
##  Min.   : 4684  
##  1st Qu.: 9281  
##  Median :10213  
##  Mean   :11052  
##  3rd Qu.:13464  
##  Max.   :15000  
##  NA's   :46
library(ggplot2)
library(ggiraph)

# JavaScript executed when a point is clicked: opens the ad's link.
# NOTE(review): `link` and `photo` come from scraped pages and are pasted into
# JavaScript / HTML without escaping; a value containing a double quote would
# break the generated snippet.
estatedf$onclick <- sprintf("window.open(\"%s\")", estatedf$link)
# HTML tooltip that shows the cover picture of the ad.
estatedf$tooltip <- sprintf("<img src=\"%s\"/>", estatedf$photo)

# Static layer: price against size, coloured by zip code.
gg_base <-
  ggplot(estatedf, aes(x = price, y = size, color = factor(location))) +
  ## scale_colour_hue(h = c(0, 90)) +
  theme_minimal()

# Interactive points carrying the tooltip and the click action.
gg_interactive <-
  gg_base +
  ## geom_smooth(method = "lm") +
  geom_point_interactive(aes(tooltip = tooltip, onclick = onclick), size = 2)

# girafe() replaces the deprecated ggiraph(); the former `width` argument had
# no effect any more and is dropped.
girafe(ggobj = gg_interactive, width_svg = 7)
# optional zoom limit: girafe(..., options = list(opts_zoom(max = 5)))
## Warning: Removed 46 rows containing missing values
## (geom_interactive_point).