The package at present scrapes and extracts information about real estate ads from a French website.
After installation, the package can be attached in the regular way:
library(estate)
The website displays at most 40 ads per page. The URLs of the first 21 pages are irregular:
- page = 1: the page number does not appear at the end of the URL.
- page < 22: variable component after “annonce/”.
- page > 1: page number in URL.

createUrl(page = 1, provider = "papfr", type = "vente")
## [1] "http://www.pap.fr/annonce/vente-appartements-paris-75-g439-40-annonces-par-page"
createUrl(page = 2, provider = "papfr", type = "vente")
## [1] "http://www.pap.fr/annonce/appartement-a-vendre-paris-75-g439-40-annonces-par-page-2"
createUrl(page = 22, provider = "papfr", type = "vente")
## [1] "http://www.pap.fr/annonce/vente-appartement-paris-75-g439-40-annonces-par-page-22"
The various low-level extraction functions make use of the XML package to access nodes in the HTML page structure (elements like `<div>`, `<p>`).
# Path to the sample HTML page stored in the package's "extdata" folder
# (a local file path, despite the variable name `url`).
url <- system.file("extdata", "papfr-40.html", package = "estate")
# Parse the saved page into an HTML document object, read as UTF-8.
doc <- XML::htmlParse(url, encoding = "utf-8")
# Number of results shown for each extraction example.
nobs <- 2
estate:::extractLinkVignette(doc)[1:nobs]
## [1] "http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414901156"
## [2] "http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414801400"
estate:::extractLinkPhoto(doc)[1:nobs]
## [1] "http://static.pap.fr/photos/B49/B49A1156.thumb.jpg"
## [2] "http://static.pap.fr/photos/B48/B48A1400.thumb.jpg"
(description <- estate:::extractDescription(doc)[1:nobs])
## [1] "Paris 14e (75014). Paris 14 ième, à deux stations de Montparnasse, Studette de 11,50 m² (11,43 Loi Carrez), au premier étage. En bon état. Salle d'eau avec douche et vraies toilettes. Bloc évier cuisinette (frigidaire, plaques,..."
## [2] "Paris 6e (75006). Charment 2 pièces au calme sur cour et lumineux, situé au 4e étage sans ascenseur, en plein coeur de Saint germain des prés. Il est composé d'un salon, 1 chambre, 1 cuisine ouverte sur salon entièrement équipée, un..."
estate:::extractCP(description)
## [1] "75014" "75006"
estate:::extractSummaryTable(doc)[1:nobs,]
## rooms bedr size
## 1 1 NA 11
## 2 2 1 37
estate:::extractPrice(doc)[1:nobs]
## [1] 103000 560000
estate:::extractDate(doc)[1:nobs]
## [1] "2017-01-23" "2017-01-23"
# Run extractList() on the sample page and render the first `nobs` rows
# of its result as a table.
estate_table <- extractList(url)
knitr::kable(estate_table[1:nobs, ])
price | photo | description | link | rooms | bedr | size | location | date |
---|---|---|---|---|---|---|---|---|
103000 | http://static.pap.fr/photos/B49/B49A1156.thumb.jpg | Paris 14e (75014). Paris 14 ième, à deux stations de Montparnasse, Studette de 11,50 m² (11,43 Loi Carrez), au premier étage. En bon état. Salle d’eau avec douche et vraies toilettes. Bloc évier cuisinette (frigidaire, plaques,… | http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414901156 | 1 | NA | 11 | 75014 | 2017-01-23 |
560000 | http://static.pap.fr/photos/B48/B48A1400.thumb.jpg | Paris 6e (75006). Charment 2 pièces au calme sur cour et lumineux, situé au 4e étage sans ascenseur, en plein coeur de Saint germain des prés. Il est composé d’un salon, 1 chambre, 1 cuisine ouverte sur salon entièrement équipée, un… | http://www.pap.fr/annonce/ventes-appartements-paris-75-g439-40-annonces-par-page-3-r414801400 | 2 | 1 | 37 | 75006 | 2017-01-23 |
In order to achieve good download performance, the curl package is used. This package is designed to make parallel calls to the same host. This way, we can retrieve the maximum number of 50 HTML pages efficiently. The download location “../inst/extdata/vente” assumes you have extracted the package source and are attempting to build the vignette using `devtools::build_vignettes()`. Create the subfolder “vente” in case it does not exist. In order to make the files visible to the `system.file` function, a re-install is required (`devtools::install()`).
# Listing type to process; switch to the commented alternative if needed.
type <- "vente"
## type <- "location"
# Folder of the installed package that holds the pages for this type.
htmldir <- system.file("extdata", type, package = "estate")
npages <- 50
# Download step -- set eval=TRUE on this chunk to activate it.
# The target folder is relative: run from the /docs/articles folder.
## setwd("vignettes")
htmldir_temp <- file.path("../inst/extdata", type)
# Remove the files of a previous download before fetching new ones.
unlink(list.files(htmldir_temp, full.names = TRUE))
downloadHtml(type = type, pages = seq_len(npages), htmldir = htmldir_temp)
# Re-install so that the data is read from the install dir.
devtools::install()
# Combine the ads extracted from several downloaded HTML pages.
#
# Arguments:
#   type    Listing type. Not used inside the function; kept so that
#           existing calls keep working.
#   htmldir Directory holding the downloaded HTML files.
#   pages   Maximum number of files to process, taken in the order
#           returned by list.files().
#
# Returns the row-bound results of extractList() over the processed
# files, or NULL when there is no file to process.
combineEstate <- function(type, htmldir, pages = 1) {
  # head() caps the selection at the number of files actually present.
  # The former `[1:pages]` padded the vector with NA when fewer files
  # existed (extractList() was then called on "<htmldir>/NA"), and
  # selected the first file for pages = 0 because 1:0 is c(1, 0).
  htmlfiles <- utils::head(list.files(htmldir, full.names = TRUE), pages)
  estate_list <- lapply(htmlfiles, extractList)
  do.call("rbind", estate_list)
}
# Extract the ads from the downloaded pages (at most `npages` files).
estatedf <- combineEstate(type = type, htmldir = htmldir, pages = npages)
# Export the full table as a tab-separated file stamped with today's date.
datenow <- format(Sys.Date(), "%Y-%m-%d")
## datenow <- "2017-01-30"
exportname <- paste0(datenow, "-estate-", type, ".tsv")
exportfile <- file.path("/tmp", exportname)
write.table(estatedf, file = exportfile, sep = "\t", row.names = FALSE)
# Price window applied to the ads, chosen per listing type.
if (type == "vente") {
  price_min <- 2e5
  price_max <- 5e5
} else if (type == "location") {
  price_min <- 900
  price_max <- 1600
}
# Keep the ads priced strictly inside the window; which() discards rows
# whose comparison is NA, as subset() does.
estatedf <- estatedf[
  which(estatedf$price > price_min & estatedf$price < price_max),
  ,
  drop = FALSE
]
## estatedf <- subset(estatedf, price < price_max & price > price_min & date > "2017-09-09")
nrow(estatedf)
## [1] 702
# Price divided by size, per ad.
estatedf$price_per_sqm <- estatedf$price / estatedf$size
# Names of the columns whose class is exactly "integer" or "numeric".
intcols <- names(estatedf)[
  vapply(
    estatedf,
    function(column) {
      column_class <- class(column)
      length(column_class) == 1L && column_class %in% c("integer", "numeric")
    },
    logical(1)
  )
]
# Summary statistics restricted to those columns.
summary(estatedf[, names(estatedf) %in% intcols])
## price rooms bedr size
## Min. :230000 Min. :1.000 Min. :1.000 Min. :16.00
## 1st Qu.:290000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:24.00
## Median :367500 Median :2.000 Median :1.000 Median :30.00
## Mean :365084 Mean :1.678 Mean :1.031 Mean :34.25
## 3rd Qu.:430000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:44.00
## Max. :499000 Max. :4.000 Max. :3.000 Max. :79.00
## NA's :287 NA's :46
## price_per_sqm
## Min. : 4684
## 1st Qu.: 9281
## Median :10213
## Mean :11052
## 3rd Qu.:13464
## Max. :15000
## NA's :46
library(ggplot2)
library(ggiraph)
# Per-point interactivity: `onclick` holds a JavaScript call that opens the
# ad's link, `tooltip` holds an <img> tag pointing at the ad's photo.
# NOTE(review): `link` and `photo` are inserted unescaped into JavaScript /
# HTML; if they can contain quotes they should be escaped -- TODO confirm.
estatedf$onclick <- sprintf("window.open(\"%s\")", estatedf$link)
estatedf$tooltip <- sprintf("<img src=\"%s\"/>", estatedf$photo)
# Base plot: size against price, one colour per location.
gg_base <-
  ggplot(estatedf, aes(x = price, y = size, color = factor(location))) +
  ## scale_colour_hue(h = c(0, 90)) +
  theme_minimal()
# Interactive point layer carrying the tooltip and click handler.
gg_interactive <-
  gg_base +
  ## geom_smooth(method = "lm") +
  geom_point_interactive(aes(tooltip = tooltip, onclick = onclick), size = 2)
# `width = 1` was removed from this call: ggiraph reports the argument as
# deprecated and without effect, so it only triggered a warning.
# NOTE(review): newer ggiraph releases appear to favour
# girafe(ggobj = gg_interactive, width_svg = 7) -- confirm against the
# installed version before switching.
ggiraph(code = print(gg_interactive), width_svg = 7) #, zoom_max = 5)
## Warning: Removed 46 rows containing missing values
## (geom_interactive_point).