Correlation
Correlation/Google Sheets demonstration
library(googlesheets4)
library(dplyr)
library(magrittr)
library(ggplot2)
library(lubridate)
## De-authorize googlesheets4: the sheet is public, so no Google sign-in or
## token is needed to read it.
gs4_deauth()
sheet <- "https://docs.google.com/spreadsheets/d/17v5sfqfGP9pZodFoo6rYMUWOXtNstKNUDGI89Nnt61o/edit#gid=2033572720"
## Read the sheet, give its four columns short names, and keep only the
## responses submitted after the start of 2023.
## The cutoff is parsed explicitly with lubridate::ymd_hms() (UTC by default)
## rather than compared as a bare string: a string would be coerced using the
## machine's local time zone, so the cutoff would shift from machine to machine.
## NOTE(review): assumes the timestamp column is read as a UTC date-time, as
## the googlesheets4 documentation describes -- confirm for this sheet.
df <- read_sheet(sheet) %>%
  set_colnames(c("timestamp", "distance", "time", "hometown")) %>%
  filter(timestamp > ymd_hms("2023-01-01 00:00:00"))
## Scatterplot of time against distance, overlaid with a fitted linear-model
## trend line.
df %>%
  ggplot(aes(x = distance, y = time)) +
  geom_point() +
  geom_smooth(method = "lm")
## Overlaid density curves for distance and time.
## The two variables share one x-axis, so the axis gets a neutral label
## (it would otherwise be auto-labelled "distance"), and the fill colours are
## mapped through a manual scale so a legend says which curve is which.
## Colours and transparency are the same as before (red / yellow, alpha 0.5).
ggplot(df) +
  geom_density(aes(x = distance, fill = "distance"), alpha = 0.5) +
  geom_density(aes(x = time, fill = "time"), alpha = 0.5) +
  scale_fill_manual(
    name = NULL,
    values = c(distance = "red", time = "yellow")
  ) +
  labs(x = "value")
## Correlation between distance and time (cor()'s default method); rows with a
## missing value in either column are left out of the calculation.
with(df, cor(distance, time, use = "complete.obs"))
Exploring correlation
## take a look at the mtcars dataset
## create plot of variables "wt" and "mpg"
## correlation of "wt" and "mpg"
## create a plot of variables "wt" and "qsec"
## correlation of "wt" and "qsec"
## create plot of variables "cyl" and "wt"
## correlation of "cyl" and "wt"
Correlation tests
## correlation test for "wt" and "mpg"
## correlation test for "wt" and "qsec"
## correlation test for "cyl" and "wt"
Modifiable areal unit problem (MAUP) demonstration
library(sf)
library(tidycensus)
library(dplyr)
library(DT)
## Load the variable catalogue for the 2017 5-year ACS (cache = TRUE stores it
## for reuse) and browse it as a DT table.
v17 <- load_variables(year = 2017, dataset = "acs5", cache = TRUE)
datatable(v17)
## County-level ACS estimates, with geometry.
## `year` and `survey` are pinned to the 2017 5-year ACS so the request matches
## the catalogue loaded with load_variables(2017, "acs5"); without them the
## vintage silently follows whatever default the installed tidycensus uses.
county.data <- get_acs(
  geography = "county",
  year = 2017,
  survey = "acs5",
  geometry = TRUE,
  variables = c(
    education = "B07009_005",
    income = "B10010_001",
    pop = "B01003_001",
    white = "B02001_002",
    renter = "B07013_003",
    owner = "B07013_002"
  ),
  output = "wide"
) %>%
  ## With output = "wide" each named variable arrives as <name>E (estimate)
  ## and <name>M (margin of error); the derived columns use the estimates.
  mutate(
    perBach = educationE / popE,
    perNonWhite = (popE - whiteE) / popE,
    ownToRent = ownerE / renterE,
    income = incomeE
  )
## Base-graphics scatterplot: income on the x-axis, perBach on the y-axis.
plot(x = county.data$income, y = county.data$perBach)
## County-level correlation of income and perBach, using only rows where both
## values are present.
cor(x = county.data$income, y = county.data$perBach, use = "complete.obs")
## State-level ACS estimates, with geometry: the same variables as the county
## request, aggregated to larger areal units.
## `year` and `survey` are pinned to the 2017 5-year ACS so the request matches
## the catalogue loaded with load_variables(2017, "acs5"); without them the
## vintage silently follows whatever default the installed tidycensus uses.
state.data <- get_acs(
  geography = "state",
  year = 2017,
  survey = "acs5",
  geometry = TRUE,
  variables = c(
    education = "B07009_005",
    income = "B10010_001",
    pop = "B01003_001",
    white = "B02001_002",
    renter = "B07013_003",
    owner = "B07013_002"
  ),
  output = "wide"
) %>%
  ## With output = "wide" each named variable arrives as <name>E (estimate)
  ## and <name>M (margin of error); the derived columns use the estimates.
  mutate(
    perBach = educationE / popE,
    perNonWhite = (popE - whiteE) / popE,
    ownToRent = ownerE / renterE,
    income = incomeE
  )
## State-level correlation of income and perBach, using only rows where both
## values are present. Uses the derived `income` column (a copy of incomeE) so
## this call reads the same as the county-level one.
cor(state.data$income, state.data$perBach, use = "complete.obs")
County level correlation analysis and correlation matrix
Predict the correlation between each pair of the following variables:
perBach: percentage of population with a bachelor’s degree
income: median family income in the past 12 months
ownToRent: ratio of owner occupied housing units to renter occupied housing units
perNonWhite: percentage of population that is not white alone
Then, using R, you will assess the assumptions of Pearson’s r, create a correlation matrix, and complete a correlation test.
## load packages: readr, GGally, and corrplot
library(readr)
library(GGally)
library(corrplot)
## Read the demo data set from the CSV at this URL.
## NOTE: this reuses the name `df`, replacing the Google Sheets data read
## earlier in the script.
df <- read_csv(
  file = "https://gitlab.com/mhaffner/data/-/raw/master/demo-data.csv"
)
## assess assumptions
## create plots
## create correlation matrix
## visualize
## correlation test: pick two variables that you are interested in