library(tidyverse)
## ─ Attaching packages ──────────────────── tidyverse 1.3.1 ─
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ─ Conflicts ───────────────────── tidyverse_conflicts() ─
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ─ Attaching packages ─────────────────── tidymodels 0.1.3 ─
## ✓ broom 0.7.9 ✓ rsample 0.1.0
## ✓ dials 0.0.10 ✓ tune 0.1.6
## ✓ infer 1.0.0 ✓ workflows 0.2.3
## ✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
## ✓ parsnip 0.1.7 ✓ yardstick 0.0.8
## ✓ recipes 0.1.17
## ─ Conflicts ───────────────────── tidymodels_conflicts() ─
## x dplyr::between() masks data.table::between()
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x purrr::transpose() masks data.table::transpose()
## • Use tidymodels_prefer() to resolve common conflicts.
library(skimr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidytext)
train_raw <- read_csv("train.csv")
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 10000 Columns: 16
## ─ Column specification ────────────────────────────
## Delimiter: ","
## chr (4): city, description, homeType, priceRange
## dbl (11): uid, latitude, longitude, garageSpaces, yearBuilt, numOfPatioAndPo...
## lgl (1): hasSpa
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_raw %>%
count(priceRange)
## # A tibble: 5 × 2
## priceRange n
## <chr> <int>
## 1 0-250000 1249
## 2 250000-350000 2356
## 3 350000-450000 2301
## 4 450000-650000 2275
## 5 650000+ 1819
price_plot <-
train_raw %>%
mutate(priceRange = parse_number(priceRange)) %>%
ggplot(aes(longitude, latitude, z = priceRange)) +
stat_summary_hex(alpha = 0.8, bins = 50) +
scale_fill_viridis_c() +
labs(
fill = "mean",
title = "Price"
)
price_plot
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
plot_austin <- function(var, title) {
train_raw %>%
ggplot(aes(longitude, latitude, z = {{ var }})) +
stat_summary_hex(alpha = 0.8, bins = 50) +
scale_fill_viridis_c() +
labs(
fill = "mean",
title = title
)
}
(price_plot + plot_austin(avgSchoolRating, "School rating")) /
(plot_austin(yearBuilt, "Year built") + plot_austin(log(lotSizeSqFt), "Lot size (log)"))
library(tidytext)
austin_tidy <-
train_raw %>%
mutate(priceRange = parse_number(priceRange) + 100000) %>%
unnest_tokens(word, description) %>%
anti_join(get_stopwords())
## Joining, by = "word"
austin_tidy %>%
count(word, sort = TRUE)
## # A tibble: 17,944 × 2
## word n
## <chr> <int>
## 1 home 11620
## 2 kitchen 5721
## 3 room 5494
## 4 austin 4918
## 5 new 4772
## 6 large 4771
## 7 2 4585
## 8 bedrooms 4571
## 9 contains 4413
## 10 3 4386
## # … with 17,934 more rows
top_words <-
austin_tidy %>%
count(word, sort = TRUE) %>%
filter(!word %in% as.character(1:5)) %>%
slice_max(n, n = 100) %>%
pull(word)
word_freqs <-
austin_tidy %>%
count(word, priceRange) %>%
complete(word, priceRange, fill = list(n = 0)) %>%
group_by(priceRange) %>%
mutate(
price_total = sum(n),
proportion = n / price_total
) %>%
ungroup() %>%
filter(word %in% top_words)
word_freqs
## # A tibble: 500 × 5
## word priceRange n price_total proportion
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 access 100000 180 56290 0.00320
## 2 access 350000 365 114853 0.00318
## 3 access 450000 322 116678 0.00276
## 4 access 550000 294 125585 0.00234
## 5 access 750000 248 112073 0.00221
## 6 appliances 100000 209 56290 0.00371
## 7 appliances 350000 583 114853 0.00508
## 8 appliances 450000 576 116678 0.00494
## 9 appliances 550000 567 125585 0.00451
## 10 appliances 750000 391 112073 0.00349
## # … with 490 more rows
word_mods <-
word_freqs %>%
nest(data = c(priceRange, n, price_total, proportion)) %>%
mutate(
model = map(data, ~ glm(cbind(n, price_total) ~ priceRange, ., family = "binomial")),
model = map(model, tidy)
) %>%
unnest(model) %>%
filter(term == "priceRange") %>%
mutate(p.value = p.adjust(p.value)) %>%
arrange(-estimate)
word_mods
## # A tibble: 100 × 7
## word data term estimate std.error statistic p.value
## <chr> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 outdoor <tibble [5 × 4]> priceRange 0.00000325 1.85e-7 17.6 4.37e-67
## 2 custom <tibble [5 × 4]> priceRange 0.00000214 1.47e-7 14.6 3.98e-46
## 3 pool <tibble [5 × 4]> priceRange 0.00000159 1.22e-7 13.0 6.12e-37
## 4 office <tibble [5 × 4]> priceRange 0.00000150 1.46e-7 10.3 6.03e-23
## 5 suite <tibble [5 × 4]> priceRange 0.00000143 1.39e-7 10.3 4.03e-23
## 6 gorgeous <tibble [5 × 4]> priceRange 0.000000975 1.62e-7 6.02 1.19e- 7
## 7 w <tibble [5 × 4]> priceRange 0.000000920 9.05e-8 10.2 2.33e-22
## 8 windows <tibble [5 × 4]> priceRange 0.000000890 1.28e-7 6.95 2.81e-10
## 9 private <tibble [5 × 4]> priceRange 0.000000889 1.15e-7 7.70 1.08e-12
## 10 car <tibble [5 × 4]> priceRange 0.000000778 1.66e-7 4.69 1.52e- 4
## # … with 90 more rows
word_freqs %>%
nest(data = c(priceRange, n, price_total, proportion)) %>%
mutate(
model = map(data, ~ glm(cbind(n, price_total) ~ priceRange, ., family = "binomial"))
)
## # A tibble: 100 × 3
## word data model
## <chr> <list> <list>
## 1 access <tibble [5 × 4]> <glm>
## 2 appliances <tibble [5 × 4]> <glm>
## 3 area <tibble [5 × 4]> <glm>
## 4 austin <tibble [5 × 4]> <glm>
## 5 back <tibble [5 × 4]> <glm>
## 6 backyard <tibble [5 × 4]> <glm>
## 7 bath <tibble [5 × 4]> <glm>
## 8 bathroom <tibble [5 × 4]> <glm>
## 9 bathrooms <tibble [5 × 4]> <glm>
## 10 beautiful <tibble [5 × 4]> <glm>
## # … with 90 more rows
library(ggrepel)
word_mods %>%
ggplot(aes(estimate, p.value)) +
geom_vline(xintercept = 0, lty = 2, alpha = 0.7, color = "gray50") +
geom_point(color = "midnightblue", alpha = 0.8, size = 2.5) +
scale_y_log10() +
geom_text_repel(aes(label = word))
## Warning: ggrepel: 77 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps