library(tidyverse)
## ─ Attaching packages ──────────────────── tidyverse 1.3.1 ─
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ─ Conflicts ───────────────────── tidyverse_conflicts() ─
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ─ Attaching packages ─────────────────── tidymodels 0.1.3 ─
## ✓ broom        0.7.9      ✓ rsample      0.1.0 
## ✓ dials        0.0.10     ✓ tune         0.1.6 
## ✓ infer        1.0.0      ✓ workflows    0.2.3 
## ✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
## ✓ parsnip      0.1.7      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.17
## ─ Conflicts ───────────────────── tidymodels_conflicts() ─
## x dplyr::between()   masks data.table::between()
## x scales::discard()  masks purrr::discard()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x recipes::fixed()   masks stringr::fixed()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x yardstick::spec()  masks readr::spec()
## x recipes::step()    masks stats::step()
## x purrr::transpose() masks data.table::transpose()
## • Use tidymodels_prefer() to resolve common conflicts.
library(skimr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidytext)
train_raw <- read_csv("train.csv")
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 10000 Columns: 16
## ─ Column specification ────────────────────────────
## Delimiter: ","
## chr  (4): city, description, homeType, priceRange
## dbl (11): uid, latitude, longitude, garageSpaces, yearBuilt, numOfPatioAndPo...
## lgl  (1): hasSpa
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_raw %>% 
    count(priceRange)
## # A tibble: 5 × 2
##   priceRange        n
##   <chr>         <int>
## 1 0-250000       1249
## 2 250000-350000  2356
## 3 350000-450000  2301
## 4 450000-650000  2275
## 5 650000+        1819
price_plot <-
  train_raw %>%
  mutate(priceRange = parse_number(priceRange)) %>%
  ggplot(aes(longitude, latitude, z = priceRange)) +
  stat_summary_hex(alpha = 0.8, bins = 50) +
  scale_fill_viridis_c() +
  labs(
    fill = "mean",
    title = "Price"
  )

price_plot

library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots
plot_austin <- function(var, title) {
  train_raw %>%
    ggplot(aes(longitude, latitude, z = {{ var }})) +
    stat_summary_hex(alpha = 0.8, bins = 50) +
    scale_fill_viridis_c() +
    labs(
      fill = "mean",
      title = title
    )
}

(price_plot + plot_austin(avgSchoolRating, "School rating")) /
  (plot_austin(yearBuilt, "Year built") + plot_austin(log(lotSizeSqFt), "Lot size (log)"))

library(tidytext)

austin_tidy <-
  train_raw %>%
  mutate(priceRange = parse_number(priceRange) + 100000) %>%
  unnest_tokens(word, description) %>%
  anti_join(get_stopwords())
## Joining, by = "word"
austin_tidy %>%
  count(word, sort = TRUE)
## # A tibble: 17,944 × 2
##    word         n
##    <chr>    <int>
##  1 home     11620
##  2 kitchen   5721
##  3 room      5494
##  4 austin    4918
##  5 new       4772
##  6 large     4771
##  7 2         4585
##  8 bedrooms  4571
##  9 contains  4413
## 10 3         4386
## # … with 17,934 more rows
top_words <-
  austin_tidy %>%
  count(word, sort = TRUE) %>%
  filter(!word %in% as.character(1:5)) %>%
  slice_max(n, n = 100) %>%
  pull(word)


word_freqs <-
  austin_tidy %>%
  count(word, priceRange) %>%
  complete(word, priceRange, fill = list(n = 0)) %>%
  group_by(priceRange) %>%
  mutate(
    price_total = sum(n),
    proportion = n / price_total
  ) %>%
  ungroup() %>%
  filter(word %in% top_words)

word_freqs
## # A tibble: 500 × 5
##    word       priceRange     n price_total proportion
##    <chr>           <dbl> <dbl>       <dbl>      <dbl>
##  1 access         100000   180       56290    0.00320
##  2 access         350000   365      114853    0.00318
##  3 access         450000   322      116678    0.00276
##  4 access         550000   294      125585    0.00234
##  5 access         750000   248      112073    0.00221
##  6 appliances     100000   209       56290    0.00371
##  7 appliances     350000   583      114853    0.00508
##  8 appliances     450000   576      116678    0.00494
##  9 appliances     550000   567      125585    0.00451
## 10 appliances     750000   391      112073    0.00349
## # … with 490 more rows
word_mods <-
  word_freqs %>%
  nest(data = c(priceRange, n, price_total, proportion)) %>%
  mutate(
    model = map(data, ~ glm(cbind(n, price_total) ~ priceRange, ., family = "binomial")),
    model = map(model, tidy)
  ) %>%
  unnest(model) %>%
  filter(term == "priceRange") %>%
  mutate(p.value = p.adjust(p.value)) %>%
  arrange(-estimate)

word_mods
## # A tibble: 100 × 7
##    word     data             term          estimate std.error statistic  p.value
##    <chr>    <list>           <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 outdoor  <tibble [5 × 4]> priceRange 0.00000325    1.85e-7     17.6  4.37e-67
##  2 custom   <tibble [5 × 4]> priceRange 0.00000214    1.47e-7     14.6  3.98e-46
##  3 pool     <tibble [5 × 4]> priceRange 0.00000159    1.22e-7     13.0  6.12e-37
##  4 office   <tibble [5 × 4]> priceRange 0.00000150    1.46e-7     10.3  6.03e-23
##  5 suite    <tibble [5 × 4]> priceRange 0.00000143    1.39e-7     10.3  4.03e-23
##  6 gorgeous <tibble [5 × 4]> priceRange 0.000000975   1.62e-7      6.02 1.19e- 7
##  7 w        <tibble [5 × 4]> priceRange 0.000000920   9.05e-8     10.2  2.33e-22
##  8 windows  <tibble [5 × 4]> priceRange 0.000000890   1.28e-7      6.95 2.81e-10
##  9 private  <tibble [5 × 4]> priceRange 0.000000889   1.15e-7      7.70 1.08e-12
## 10 car      <tibble [5 × 4]> priceRange 0.000000778   1.66e-7      4.69 1.52e- 4
## # … with 90 more rows
word_freqs %>%
  nest(data = c(priceRange, n, price_total, proportion)) %>%
  mutate(
    model = map(data, ~ glm(cbind(n, price_total) ~ priceRange, ., family = "binomial"))
  )
## # A tibble: 100 × 3
##    word       data             model 
##    <chr>      <list>           <list>
##  1 access     <tibble [5 × 4]> <glm> 
##  2 appliances <tibble [5 × 4]> <glm> 
##  3 area       <tibble [5 × 4]> <glm> 
##  4 austin     <tibble [5 × 4]> <glm> 
##  5 back       <tibble [5 × 4]> <glm> 
##  6 backyard   <tibble [5 × 4]> <glm> 
##  7 bath       <tibble [5 × 4]> <glm> 
##  8 bathroom   <tibble [5 × 4]> <glm> 
##  9 bathrooms  <tibble [5 × 4]> <glm> 
## 10 beautiful  <tibble [5 × 4]> <glm> 
## # … with 90 more rows
library(ggrepel)

word_mods %>%
  ggplot(aes(estimate, p.value)) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.7, color = "gray50") +
  geom_point(color = "midnightblue", alpha = 0.8, size = 2.5) +
  scale_y_log10() +
  geom_text_repel(aes(label = word))
## Warning: ggrepel: 77 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps