library(tidyverse)
# You may need to change the file path (this one is for macs)
mortgage <- read_csv("~/Downloads/county_06065.csv") %>%
filter(
business_or_commercial_purpose == 2,
total_units == 1,
derived_dwelling_category == "Single Family (1-4 Units):Site-Built",
action_taken %in% c(1, 2, 3),
income >= 0,
income <= 800
) %>%
mutate(
approved = if_else(action_taken %in% c(1, 2), 1, 0),
loan_purpose = case_when(
loan_purpose == 1 ~ "Home purchase",
loan_purpose == 2 ~ "Home improvement",
loan_purpose == 31 ~ "Refinance",
loan_purpose == 32 ~ "Cash-out refinance",
loan_purpose == 4 ~ "Other",
.default = NA_character_
),
derived_ethnicity = case_when(
derived_ethnicity == "Hispanic or Latino" ~ "Hispanic",
derived_ethnicity == "Not Hispanic or Latino" ~ "Not Hispanic",
derived_ethnicity == "Joint" ~ "Joint",
.default = NA_character_
),
derived_sex = na_if(derived_sex, "Sex Not Available"),
derived_race = case_when(
derived_race == "White" ~ "White",
derived_race == "Black or African American" ~ "Black",
derived_race == "Asian" ~ "Asian",
derived_race == "Joint" ~ "Joint",
derived_race %in% c(
"American Indian or Alaska Native",
"Native Hawaiian or Other Pacific Islander",
"2 or more minority races"
) ~ "Other",
.default = NA_character_
),
applicant_age = na_if(applicant_age, "8888"),
debt_to_income_ratio = case_when(
debt_to_income_ratio == "<20%" ~ "15",
debt_to_income_ratio == "20%-<30%" ~ "25",
debt_to_income_ratio == "30%-<36%" ~ "33",
debt_to_income_ratio == "50%-60%" ~ "55",
debt_to_income_ratio == ">60%" ~ "65",
debt_to_income_ratio %in% as.character(36:49) ~ debt_to_income_ratio,
.default = NA_character_
),
loan_type = case_when(
loan_type == 1 ~ "Conventional",
loan_type == 2 ~ "FHA",
loan_type == 3 ~ "VA",
loan_type == 4 ~ "USDA/FSA",
.default = NA_character_
),
lien_status = case_when(
lien_status == 1 ~ "First lien",
lien_status == 2 ~ "Subordinate lien",
.default = NA_character_
),
occupancy_type = case_when(
occupancy_type == 1 ~ "Principal residence",
occupancy_type == 2 ~ "Second residence",
occupancy_type == 3 ~ "Investment property",
.default = NA_character_
),
loan_amount = loan_amount / 1000,
property_value = as.numeric(property_value) / 1000,
debt_to_income_ratio = as.numeric(debt_to_income_ratio),
loan_term = as.numeric(loan_term),
interest_rate = as.numeric(interest_rate),
loan_purpose = factor(loan_purpose),
derived_ethnicity = factor(derived_ethnicity),
derived_race = factor(derived_race),
derived_sex = factor(derived_sex),
applicant_age = factor(
applicant_age,
levels = c("<25", "25-34", "35-44", "45-54", "55-64", "65-74", ">74")
),
conforming_loan_limit = factor(conforming_loan_limit),
loan_type = factor(loan_type),
lien_status = factor(lien_status),
occupancy_type = factor(occupancy_type)
) %>%
select(
approved,
income,
debt_to_income_ratio,
race = derived_race,
ethnicity = derived_ethnicity,
sex = derived_sex,
age = applicant_age,
loan_amount,
loan_type,
loan_purpose,
lien_status,
conforming_loan_limit,
property_value,
occupancy_type
) %>%
mutate(
race = fct_relevel(race, "White"),
ethnicity = fct_relevel(ethnicity, "Not Hispanic"),
sex = fct_relevel(sex, "Joint"),
age = fct_relevel(age, "35-44"),
loan_type = fct_relevel(loan_type, "Conventional"),
loan_purpose = fct_relevel(loan_purpose, "Home purchase"),
lien_status = fct_relevel(lien_status, "First lien"),
conforming_loan_limit = fct_relevel(conforming_loan_limit, "C"),
occupancy_type = fct_relevel(occupancy_type, "Principal residence")
)6 Logistic Classification
Setup
Here, I’ve worked on cleaning the mortgage data set up a little more. My changes:
- Took out applications for business or commercial purposes
- Removed any applications for properties with more than 1 dwelling unit
- Focused us on applications for single family dwellings
- Dropped observations with negative income or income more than 800K/year
Part 1: Linear Probability Model
First, let’s consider again the linear probability model:
# Estimate the linear probability model with OLS
lpm <- mortgage %>% lm(approved ~ ., data = .)
# View the tidied results
lpm %>% broom::tidy() %>% arrange(desc(estimate)) %>% view()
# a) Rank the applicants from highest to lowest predicted probability of
# being accepted. What's similar and different about each group?
mortgage %>%
mutate(prediction = predict(lpm, newdata = .)) %>%
select(prediction, everything()) %>%
view()
# b) Take the median applicant and find their predicted probability of
# being accepted. Then change the loan purpose to "Cash-out refinance".
# How much does the prediction change? Does it make sense?
mortgage %>%
summarise(
across(where(is.numeric), \(x) median(x, na.rm = TRUE)),
across(where(is.factor), \(x) {
names(which.max(table(x)))
})
) %>%
add_row(approved = 1, income = 133, debt_to_income_ratio = 45, loan_amount = 315, property_value = 625, race = "White", ethnicity = "Not Hispanic", sex = "Joint", age = "35-44", loan_type = "Conventional", loan_purpose = "____", lien_status = "First lien", conforming_loan_limit = "C", occupancy_type = "Principal residence") %>%
mutate(prediction = predict(lpm, newdata = .)) %>%
select(prediction, everything()) %>%
view()
# c) There's a puzzle here: A plot of income against approved shows
# approvals increase in frequency with income, but the full linear
# model gives income a negative coefficient. Why could this be?
# Hint: Try estimating the model `approved ~ income + loan_purpose +
# race + debt_to_income_ratio`, and then drop one variable at a time.
# What happens to the coefficient on income? Discuss.
mortgage %>%
lm(approved ~ income + loan_purpose + race + debt_to_income_ratio, data = .) %>%
broom::tidy()The linear probability model has one major weakness: because it’s a line of best fit, it may predict someone has a probability of being approved that’s above 1 (or less than 0): this is nonsense.
The logistic classification model won’t have that problem.
Part 2: Logistic Classification Model (Logit)
Now we’ll estimate a logistic classification model and compare it to the linear probability model.
A logistic regression is a linear regression in terms of log odds:
\[\log\left(\frac{p(x)}{1 - p(x)}\right) = \beta_0 + \beta_1 x\]
Take the logit model and solve for \(p(x)\).
Explain why logit predicted probabilities \(p(x)\) will always be bounded by 0 and 1.
Visualizing Logit vs Linear Probability Model
The linear probability model is in red; the logit is in green. In general, a logit will be curved in predicted probability, although it’s a straight line in predicted log odds. The logit is S-shaped (“sigmoid”), bounded by 0 and 1, although we only see the top half of the S shape here.
mortgage %>%
select(approved, income) %>%
ggplot(aes(x = income, y = approved)) +
geom_jitter(height = .4, alpha = .1) +
geom_smooth(method = lm, color = "red") +
geom_smooth(method = "glm", method.args = list(family = "binomial"), se = F, color = "green")# This part will mirror the linear probability model analysis
# from the beginning of this assignment, but now with the logit:
# Estimate the logit
logit <- mortgage %>% glm(approved ~ ., data = ., family = "binomial")
# View the tidied results
logit %>%
broom::tidy() %>%
arrange(desc(estimate)) %>%
view()
# c) Rank the applicants from highest to lowest predicted
# probability of being accepted, according to the logit.
# What's similar and different about each group? How
# do these answers differ from the linear probability model?
mortgage %>%
mutate(prediction = predict(logit, newdata = ., type = "response")) %>%
select(prediction, everything()) %>%
view()
# d) Take the median applicant and find their predicted
# probability of being accepted. Then change the loan
# purpose to "Cash-out refinance". How much does the
# prediction change? Does it make sense?
mortgage %>%
summarise(
across(where(is.numeric), \(x) median(x, na.rm = TRUE)),
across(where(is.factor), \(x) {
names(which.max(table(x)))
})
) %>%
add_row(approved = 1, income = 133, debt_to_income_ratio = 45, loan_amount = 315, property_value = 625, race = "White", ethnicity = "Not Hispanic", sex = "Joint", age = "35-44", loan_type = "Conventional", loan_purpose = "___", lien_status = "First lien", conforming_loan_limit = "C", occupancy_type = "Principal residence") %>%
mutate(prediction = predict(logit, newdata = ., type = "response")) %>%
select(prediction, everything()) %>%
view()
# e) What are the differences and similarities between the
# income puzzle in the linear probability model and the income
# puzzle in the logit?
mortgage %>%
glm(approved ~ ., data = ., family = "binomial") %>%
broom::tidy() %>%
filter(term == "income")
mortgage %>%
glm(approved ~ income + loan_purpose + race + debt_to_income_ratio, data = ., family = "binomial") %>%
broom::tidy()
mortgage %>%
glm(approved ~ income, data = ., family = "binomial") %>%
broom::tidy()
mortgage %>%
glm(approved ~ income + loan_purpose + race + debt_to_income_ratio, data = ., family = "binomial") %>%
broom::tidy()Download this assignment
Here’s a link to download this assignment.