Correlation

Correlation/Google Sheets demonstration

library(googlesheets4)
library(dplyr)
library(magrittr)
library(ggplot2)
library(lubridate)

## put googlesheets4 into de-authorized mode: the sheet is public, so no
## Google login is needed
gs4_deauth()
sheet <- "https://docs.google.com/spreadsheets/d/17v5sfqfGP9pZodFoo6rYMUWOXtNstKNUDGI89Nnt61o/edit#gid=2033572720"

## only responses submitted after this instant are kept; the cutoff is built as
## an explicit UTC date-time because comparing against a bare string makes R
## parse that string in the local time zone of whoever runs the script
## NOTE(review): assumes read_sheet() returns `timestamp` as POSIXct in UTC --
## confirm against the sheet's column type
cutoff <- ymd_hms("2023-01-01 00:00:00", tz = "UTC")

## read data, give the four columns short names, and drop older responses
df <- read_sheet(sheet) %>%
  set_colnames(c("timestamp", "distance", "time", "hometown")) %>%
  filter(timestamp > cutoff)

## scatterplot of time against distance, with a fitted linear-model trend line
df %>%
  ggplot(aes(distance, time)) +
  geom_point() +
  geom_smooth(method = "lm")

## overlaid density plots of both variables
## the two densities share one x axis, so label it explicitly; left to the
## default, the axis is titled "distance" even though time is drawn on it too
ggplot(df) +
  geom_density(aes(x = distance), fill = "red", alpha = 0.5) +
  geom_density(aes(x = time), fill = "yellow", alpha = 0.5) +
  labs(x = "distance (red) / time (yellow)")

## Pearson correlation between distance and time; rows with a missing value in
## either column are left out of the calculation
with(df, cor(distance, time, use = "complete.obs"))

Exploring correlation

## take a look at the mtcars dataset

## create plot of variables "wt" and "mpg"

## correlation of "wt" and "mpg"

## create a plot of variables "wt" and "qsec"

## correlation of "wt" and "qsec"

## create plot of variables "cyl" and "wt"

## correlation of "cyl" and "wt"

Correlation tests

## correlation test for "wt" and "mpg"

## correlation test for "wt" and "qsec"

## correlation test for "cyl" and "wt"

Modifiable areal unit problem (MAUP) demonstration

library(sf)
library(tidycensus)
library(dplyr)
library(DT)

## ACS vintage and dataset, defined once so that the variable lookup and both
## downloads below always refer to the same release (without an explicit year,
## get_acs() falls back to a package default that changes between versions)
acs_year <- 2017
acs_survey <- "acs5"

## browse the variable codes available for this release
v17 <- load_variables(acs_year, acs_survey, cache = TRUE)
datatable(v17)

## ACS variable codes; the names become column prefixes in the wide output
## (the estimate columns used below are educationE, popE, and so on)
acs_vars <- c(education = "B07009_005",
              income = "B10010_001",
              pop = "B01003_001",
              white = "B02001_002",
              renter = "B07013_003",
              owner = "B07013_002")

## download the variables above for one geography ("county", "state", ...) and
## add the derived columns used in the analysis; the same request is used for
## every geography so that only the areal unit differs between the data sets
get_maup_data <- function(geography) {
  get_acs(geography = geography,
          geometry = TRUE,
          variables = acs_vars,
          year = acs_year,
          survey = acs_survey,
          output = "wide") %>%
    mutate(perBach = educationE / popE,
           perNonWhite = (popE - whiteE) / popE,
           ownToRent = ownerE / renterE,
           income = incomeE)
}

county.data <- get_maup_data("county")

## plot
plot(county.data$income, county.data$perBach)

## correlation
cor(county.data$income, county.data$perBach, use = "complete.obs")

state.data <- get_maup_data("state")

## correlation (same pair of columns as the county-level calculation)
cor(state.data$income, state.data$perBach, use = "complete.obs")

County level correlation analysis and correlation matrix

Predict the correlation between each pair of the following variables:

  • perBach: percentage of population with a bachelor’s degree

  • income: median family income in the past 12 months

  • ownToRent: ratio of owner occupied housing units to renter occupied housing units

  • perNonWhite: percentage of population that is not white alone

Then, using R, you will assess the assumptions of Pearson’s r, create a correlation matrix, and complete a correlation test.

## load packages: readr, GGally, and corrplot
library(readr)
library(GGally)
library(corrplot)

## load the demo data set from the course data repository into `df`
df <- readr::read_csv(
  file = "https://gitlab.com/mhaffner/data/-/raw/master/demo-data.csv"
)

## assess assumptions

## create plots

## create correlation matrix

## visualize

## correlation test: pick two variables that you are interested in