6 Logistic Classification

Setup

Here, I’ve worked on cleaning the mortgage data set up a little more. My changes:

Took out applications for business or commercial purposes
Removed any applications for properties with more than 1 dwelling unit
Focused us on applications for single family dwellings
Dropped observations with negative income or income more than 800K/year

library(tidyverse)

# You may need to change the file path (this one is for macs)
mortgage <- read_csv("~/Downloads/county_06065.csv") %>%
  filter(
    business_or_commercial_purpose == 2,
    total_units == 1,
    derived_dwelling_category == "Single Family (1-4 Units):Site-Built",
    action_taken %in% c(1, 2, 3),
    income >= 0,
    income <= 800
  ) %>%
  mutate(
    approved = if_else(action_taken %in% c(1, 2), 1, 0),

    loan_purpose = case_when(
      loan_purpose == 1  ~ "Home purchase",
      loan_purpose == 2  ~ "Home improvement",
      loan_purpose == 31 ~ "Refinance",
      loan_purpose == 32 ~ "Cash-out refinance",
      loan_purpose == 4  ~ "Other",
      .default = NA_character_
    ),

    derived_ethnicity = case_when(
      derived_ethnicity == "Hispanic or Latino"     ~ "Hispanic",
      derived_ethnicity == "Not Hispanic or Latino" ~ "Not Hispanic",
      derived_ethnicity == "Joint"                  ~ "Joint",
      .default = NA_character_
    ),

    derived_sex = na_if(derived_sex, "Sex Not Available"),

    derived_race = case_when(
      derived_race == "White"                      ~ "White",
      derived_race == "Black or African American" ~ "Black",
      derived_race == "Asian"                     ~ "Asian",
      derived_race == "Joint"                     ~ "Joint",
      derived_race %in% c(
        "American Indian or Alaska Native",
        "Native Hawaiian or Other Pacific Islander",
        "2 or more minority races"
      ) ~ "Other",
      .default = NA_character_
    ),

    applicant_age = na_if(applicant_age, "8888"),

    debt_to_income_ratio = case_when(
      debt_to_income_ratio == "<20%" ~ "15",
      debt_to_income_ratio == "20%-<30%" ~ "25",
      debt_to_income_ratio == "30%-<36%" ~ "33",
      debt_to_income_ratio == "50%-60%" ~ "55",
      debt_to_income_ratio == ">60%" ~ "65",
      debt_to_income_ratio %in% as.character(36:49) ~ debt_to_income_ratio,
      .default = NA_character_
    ),

    loan_type = case_when(
      loan_type == 1 ~ "Conventional",
      loan_type == 2 ~ "FHA",
      loan_type == 3 ~ "VA",
      loan_type == 4 ~ "USDA/FSA",
      .default = NA_character_
    ),

    lien_status = case_when(
      lien_status == 1 ~ "First lien",
      lien_status == 2 ~ "Subordinate lien",
      .default = NA_character_
    ),

    occupancy_type = case_when(
      occupancy_type == 1 ~ "Principal residence",
      occupancy_type == 2 ~ "Second residence",
      occupancy_type == 3 ~ "Investment property",
      .default = NA_character_
    ),

    loan_amount = loan_amount / 1000,
    property_value = as.numeric(property_value) / 1000,
    debt_to_income_ratio = as.numeric(debt_to_income_ratio),
    loan_term = as.numeric(loan_term),
    interest_rate = as.numeric(interest_rate),

    loan_purpose = factor(loan_purpose),
    derived_ethnicity = factor(derived_ethnicity),
    derived_race = factor(derived_race),
    derived_sex = factor(derived_sex),
    applicant_age = factor(
      applicant_age,
      levels = c("<25", "25-34", "35-44", "45-54", "55-64", "65-74", ">74")
    ),
    conforming_loan_limit = factor(conforming_loan_limit),
    loan_type = factor(loan_type),
    lien_status = factor(lien_status),
    occupancy_type = factor(occupancy_type)
  ) %>%
  select(
    approved,
    income,
    debt_to_income_ratio,
    race = derived_race,
    ethnicity = derived_ethnicity,
    sex = derived_sex,
    age = applicant_age,
    loan_amount,
    loan_type,
    loan_purpose,
    lien_status,
    conforming_loan_limit,
    property_value,
    occupancy_type
  ) %>%
  mutate(
    race = fct_relevel(race, "White"),
    ethnicity = fct_relevel(ethnicity, "Not Hispanic"),
    sex = fct_relevel(sex, "Joint"),
    age = fct_relevel(age, "35-44"),
    loan_type = fct_relevel(loan_type, "Conventional"),
    loan_purpose = fct_relevel(loan_purpose, "Home purchase"),
    lien_status = fct_relevel(lien_status, "First lien"),
    conforming_loan_limit = fct_relevel(conforming_loan_limit, "C"),
    occupancy_type = fct_relevel(occupancy_type, "Principal residence")
  )

Part 1: Linear Probability Model

First, let’s consider again the linear probability model:

# Estimate the linear probability model with OLS
lpm <- mortgage %>% lm(approved ~ ., data = .)

# View the tidied results
lpm %>% broom::tidy() %>% arrange(desc(estimate)) %>% view()

# a) Rank the applicants from highest to lowest predicted probability of
# being accepted. What's similar and different about each group?

mortgage %>%
  mutate(prediction = predict(lpm, newdata = .)) %>%
  select(prediction, everything()) %>%
  view()


# b) Take the median applicant and find their predicted probability of
# being accepted. Then change the loan purpose to "Cash-out refinance".
# How much does the prediction change? Does it make sense?

mortgage %>%
  summarise(
    across(where(is.numeric), \(x) median(x, na.rm = TRUE)),
    across(where(is.factor), \(x) {
      names(which.max(table(x)))
    })
  ) %>%
  add_row(approved = 1, income = 133, debt_to_income_ratio = 45, loan_amount = 315, property_value = 625, race = "White", ethnicity = "Not Hispanic", sex = "Joint", age = "35-44", loan_type = "Conventional", loan_purpose = "____", lien_status = "First lien", conforming_loan_limit = "C", occupancy_type = "Principal residence") %>%
  mutate(prediction = predict(lpm, newdata = .)) %>%
  select(prediction, everything()) %>%
  view()

# c) There's a puzzle here: A plot of income against approved shows 
# approvals increase in frequency with income, but the full linear 
# model gives income a negative coefficient. Why could this be?
# Hint: Try estimating the model `approved ~ income + loan_purpose +
# race + debt_to_income_ratio`, and then drop one variable at a time.
# What happens to the coefficient on income? Discuss.

mortgage %>%
  lm(approved ~ income + loan_purpose + race + debt_to_income_ratio, data = .) %>%
  broom::tidy()

The linear probability model has one major weakness: because it’s a line of best fit, it may predict someone has a probability of being approved that’s above 1 (or less than 0): this is nonsense.

The logistic classification model won’t have that problem.

Part 2: Logistic Classification Model (Logit)

Link for Pronunciation

Now we’ll estimate a logistic classification model and compare it to the linear probability model.

A logistic regression is a linear regression in terms of log odds:

\[\log\left(\frac{p(x)}{1 - p(x)}\right) = \beta_0 + \beta_1 x\]

Take the logit model and solve for \(p(x)\).
Explain why logit predicted probabilities \(p(x)\) will always be bounded by 0 and 1.

Visualizing Logit vs Linear Probability Model

The linear probability model is in red; the logit is in green. In general, a logit will be curved in predicted probability, although it’s a straight line in predicted log odds. The logit is S-shaped (“sigmoid”), bounded by 0 and 1, although we only see the top half of the S shape here.

mortgage %>%
  select(approved, income) %>%
  ggplot(aes(x = income, y = approved)) +
  geom_jitter(height = .4, alpha = .1) +
  geom_smooth(method = lm, color = "red") +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = F, color = "green")

# This part will mirror the linear probability model analysis
# from the beginning of this assignment, but now with the logit:

# Estimate the logit
logit <- mortgage %>% glm(approved ~ ., data = ., family = "binomial")

# View the tidied results
logit %>%
  broom::tidy() %>%
  arrange(desc(estimate)) %>% 
  view()

# c) Rank the applicants from highest to lowest predicted
# probability of being accepted, according to the logit. 
# What's similar and different about each group? How
# do these answers differ from the linear probability model?

mortgage %>%
  mutate(prediction = predict(logit, newdata = ., type = "response")) %>%
  select(prediction, everything()) %>%
  view()


# d) Take the median applicant and find their predicted 
# probability of being accepted. Then change the loan 
# purpose to "Cash-out refinance". How much does the 
# prediction change? Does it make sense?

mortgage %>%
  summarise(
    across(where(is.numeric), \(x) median(x, na.rm = TRUE)),
    across(where(is.factor), \(x) {
      names(which.max(table(x)))
    })
  ) %>%
  add_row(approved = 1, income = 133, debt_to_income_ratio = 45, loan_amount = 315, property_value = 625, race = "White", ethnicity = "Not Hispanic", sex = "Joint", age = "35-44", loan_type = "Conventional", loan_purpose = "___", lien_status = "First lien", conforming_loan_limit = "C", occupancy_type = "Principal residence") %>%
  mutate(prediction = predict(logit, newdata = ., type = "response")) %>%
  select(prediction, everything()) %>%
  view()

# e) What are the differences and similarities between the
# income puzzle in the linear probability model and the income
# puzzle in the logit?

mortgage %>%
  glm(approved ~ ., data = ., family = "binomial") %>%
  broom::tidy() %>%
  filter(term == "income")

mortgage %>%
  glm(approved ~ income + loan_purpose + race + debt_to_income_ratio, data = ., family = "binomial") %>%
  broom::tidy()

mortgage %>%
  glm(approved ~ income, data = ., family = "binomial") %>%
  broom::tidy()

mortgage %>%
  glm(approved ~ income + loan_purpose + race + debt_to_income_ratio, data = ., family = "binomial") %>%
  broom::tidy()

Download this assignment

Here’s a link to download this assignment.