Manye Dong 2023-09-30
library(tidyverse)
library(readr)
library(readxl)
pols_month =
read_csv("pols-month.csv") |>
janitor::clean_names() |>
separate(mon, sep = "-", into = c("year", "month", "day")) |>
mutate(year = as.numeric(year), month = as.numeric(month),
day = as.numeric(day), month = month.name[month],
president = ifelse(prez_gop==0, "dem", "gop")) |>
select(-c(prez_dem, prez_gop, day))
head(pols_month)
## # A tibble: 6 × 9
## year month gov_gop sen_gop rep_gop gov_dem sen_dem rep_dem president
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1947 January 23 51 253 23 45 198 dem
## 2 1947 February 23 51 253 23 45 198 dem
## 3 1947 March 23 51 253 23 45 198 dem
## 4 1947 April 23 51 253 23 45 198 dem
## 5 1947 May 23 51 253 23 45 198 dem
## 6 1947 June 23 51 253 23 45 198 dem
snp =
read_csv("snp.csv") |>
janitor::clean_names() |>
separate(date, sep = "/", into = c("month", "day", "year")) |>
mutate(year = as.numeric(year), month = as.numeric(month),
day = as.numeric(day), month = month.name[month],
year = ifelse((50<=year & year<100), 1900+year, 2000+year)) |>
select(year, month, close)
head(snp)
## # A tibble: 6 × 3
## year month close
## <dbl> <chr> <dbl>
## 1 2015 July 2080.
## 2 2015 June 2063.
## 3 2015 May 2107.
## 4 2015 April 2086.
## 5 2015 March 2068.
## 6 2015 February 2104.
unemployment =
read_csv("unemployment.csv") |>
janitor::clean_names() |>
pivot_longer(
jan:dec,
names_to = "month",
values_to = "unemployment_percent") |>
mutate(year = as.numeric(year),
month = recode(month,
"jan" = "January",
"feb" = "February",
"mar" = "March",
"apr" = "April",
"may" = "May",
"jun" = "June",
"jul" = "July",
"aug" = "August",
"sep" = "September",
"oct" = "October",
"nov" = "November",
"dec" = "December"))
head(unemployment)
## # A tibble: 6 × 3
## year month unemployment_percent
## <dbl> <chr> <dbl>
## 1 1948 January 3.4
## 2 1948 February 3.8
## 3 1948 March 4
## 4 1948 April 3.9
## 5 1948 May 3.5
## 6 1948 June 3.6
Next, we merge the three datasets:
joined =
left_join(pols_month, snp) |>
left_join(x = _, y = unemployment)
head(joined)
## # A tibble: 6 × 11
## year month gov_gop sen_gop rep_gop gov_dem sen_dem rep_dem president close
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1947 January 23 51 253 23 45 198 dem NA
## 2 1947 February 23 51 253 23 45 198 dem NA
## 3 1947 March 23 51 253 23 45 198 dem NA
## 4 1947 April 23 51 253 23 45 198 dem NA
## 5 1947 May 23 51 253 23 45 198 dem NA
## 6 1947 June 23 51 253 23 45 198 dem NA
## # ℹ 1 more variable: unemployment_percent <dbl>
The pols_month data contains 822 rows and 9 columns. The range of the year goes from 1947 to 2015. Key variables include the number of people in both parties in senate, representatives and government, as well as the party of the president in the corresponding date.
The snp data contains 787 rows and 3 features. The range of the year goes from 1947 to 2015. Key variables include the closing values of the S&P stock index on the associated date.
The unemployement data contains 816 rows and 3 features. The range of year goes from 1948 to 2015. Key variables include the unemployment_percent for the corresponding year.
The resulting dataset contains 822 rows and 11 columns. The range of the year goes from 1947 to 2015. Key variables include all the columns noted above, plus the “Na” values for the close column.
mister_trash =
read_excel("202309 Trash Wheel Collection Data.xlsx",
sheet = "Mr. Trash Wheel", range = "A2:N586") |>
janitor::clean_names() |>
separate(date, sep ="-", into = c("year", "month", "day")) |>
mutate(homes_powered = (weight_tons * 500) / 30,
month = month.abb[as.numeric(month)],
wheel = "mister")
tail(mister_trash)
## # A tibble: 6 × 15
## dumpster year month day weight_tons volume_cubic_yards plastic_bottles
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 579 2023 May 04 3.66 15 2200
## 2 580 2023 May 04 2.62 15 3100
## 3 581 2023 Jun 28 3.56 15 3400
## 4 582 2023 Jun 28 3.79 15 4000
## 5 583 2023 Jun 28 2.28 10 2100
## 6 584 2023 Jun 29 3.9 15 1900
## # ℹ 8 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## # glass_bottles <dbl>, plastic_bags <dbl>, wrappers <dbl>,
## # sports_balls <dbl>, homes_powered <dbl>, wheel <chr>
prof_trash =
read_excel("202309 Trash Wheel Collection Data.xlsx",
sheet = "Professor Trash Wheel", range = "A2:M108") |>
janitor::clean_names() |>
separate(date, sep ="-", into = c("year", "month", "day")) |>
mutate(homes_powered = (weight_tons * 500) / 30,
month = month.abb[as.numeric(month)],
wheel = "professor")
tail(prof_trash)
## # A tibble: 6 × 14
## dumpster year month day weight_tons volume_cubic_yards plastic_bottles
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 101 2022 Dec 29 2.13 15 4200
## 2 102 2023 Feb 15 2.41 15 2900
## 3 103 2023 Apr 10 2.05 15 NA
## 4 104 2023 Apr 20 2.58 15 3400
## 5 105 2023 Jun 16 1.85 15 4100
## 6 106 2023 Jun 29 2.25 15 3900
## # ℹ 7 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## # glass_bottles <dbl>, plastic_bags <dbl>, wrappers <dbl>,
## # homes_powered <dbl>, wheel <chr>
gwynnda_trash =
read_excel("202309 Trash Wheel Collection Data.xlsx",
sheet = "Gwynnda Trash Wheel", range = "A2:L157") |>
janitor::clean_names() |>
separate(date, sep ="-", into = c("year", "month", "day")) |>
mutate(homes_powered = (weight_tons * 500) / 30,
month = month.abb[as.numeric(month)],
wheel = "gwynnda")
tail(gwynnda_trash)
## # A tibble: 6 × 13
## dumpster year month day weight_tons volume_cubic_yards plastic_bottles
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 149 2023 Jun 28 2.68 15 5400
## 2 150 2023 Jun 29 2.74 15 4400
## 3 151 2023 Jun 29 3.12 15 5000
## 4 152 2023 Jun 29 3.12 15 3400
## 5 153 2023 Jun 29 3.45 15 4900
## 6 154 2023 Jun 30 2.88 15 2500
## # ℹ 6 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## # plastic_bags <dbl>, wrappers <dbl>, homes_powered <dbl>, wheel <chr>
Now we combine these three datasets:
trash_wheels =
bind_rows(mister_trash, prof_trash, gwynnda_trash) |>
janitor::clean_names()
tail(trash_wheels)
## # A tibble: 6 × 15
## dumpster year month day weight_tons volume_cubic_yards plastic_bottles
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 149 2023 Jun 28 2.68 15 5400
## 2 150 2023 Jun 29 2.74 15 4400
## 3 151 2023 Jun 29 3.12 15 5000
## 4 152 2023 Jun 29 3.12 15 3400
## 5 153 2023 Jun 29 3.45 15 4900
## 6 154 2023 Jun 30 2.88 15 2500
## # ℹ 8 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## # glass_bottles <dbl>, plastic_bags <dbl>, wrappers <dbl>,
## # sports_balls <dbl>, homes_powered <dbl>, wheel <chr>
-
There are 845 rows/observations in the resulting dataset.
-
Key variables include dumpster id, year-month-day as the time of the record for the trash collections, and the type of trash collected by each dumpster.
-
The total weight of trash collected by Professor Trash Wheel is 216.26 in tons.
# filter the conditions for trash
filtered_gywnnda =
gwynnda_trash |>
select(year, month, cigarette_butts) |>
filter(year=="2021", month=="Jul")
head(filtered_gywnnda)
- The total number of cigarette butts collected by Gwynnda in July of 2021 is 1.63^{4} (16300).
MCI_baseline = read_csv("MCI_baseline.csv", skip=1)
MCI_cleaned =
MCI_baseline |>
janitor::clean_names() |>
mutate(sex = ifelse(sex==0, "Female", "Male"),
apoe4 = ifelse(apoe4==0, "Non-carrier", "Carrier"),
age_at_onset = ifelse(age_at_onset==".", 0, age_at_onset),
age_at_onset = as.numeric(age_at_onset))
MCI_cleaned
## # A tibble: 483 × 6
## id current_age sex education apoe4 age_at_onset
## <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1 63.1 Female 16 Carrier 0
## 2 2 65.6 Female 20 Carrier 0
## 3 3 62.5 Male 16 Carrier 66.8
## 4 4 69.8 Female 16 Non-carrier 0
## 5 5 66 Male 16 Non-carrier 68.7
## 6 6 62.5 Male 16 Non-carrier 0
## 7 7 66.5 Male 18 Non-carrier 74
## 8 8 67.2 Female 18 Non-carrier 0
## 9 9 66.7 Female 16 Non-carrier 0
## 10 10 64.1 Female 18 Non-carrier 0
## # ℹ 473 more rows
MCI_predate = MCI_cleaned |>
filter(current_age >= age_at_onset & age_at_onset != 0)
MCI_predate
## # A tibble: 4 × 6
## id current_age sex education apoe4 age_at_onset
## <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 72 62.9 Male 16 Carrier 62.5
## 2 234 66.7 Male 18 Non-carrier 66.7
## 3 283 69 Male 16 Non-carrier 68.3
## 4 380 70.3 Female 16 Non-carrier 70.2
As seen in the above table, we exclude those whose onset age is smaller than their current age:
MCI_cleaned = MCI_cleaned |>
filter(id != 72) |>
filter(id != 234) |>
filter(id != 283) |>
filter(id != 380)
MCI_cleaned
## # A tibble: 479 × 6
## id current_age sex education apoe4 age_at_onset
## <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1 63.1 Female 16 Carrier 0
## 2 2 65.6 Female 20 Carrier 0
## 3 3 62.5 Male 16 Carrier 66.8
## 4 4 69.8 Female 16 Non-carrier 0
## 5 5 66 Male 16 Non-carrier 68.7
## 6 6 62.5 Male 16 Non-carrier 0
## 7 7 66.5 Male 18 Non-carrier 74
## 8 8 67.2 Female 18 Non-carrier 0
## 9 9 66.7 Female 16 Non-carrier 0
## 10 10 64.1 Female 18 Non-carrier 0
## # ℹ 469 more rows
-
I used “skip=1” parameter when importing data to exclude the first row of the because it will mess up with the column names one row below. I also used filter function to exclude those whose onset age is before this study.
-
Relevant features of the dataset include current age of participants, their gender, education in years, and the age at onset of MCI in decimal form after data cleaning.
-
There were originally 479 participants recruited (already excluding those who developed MCI before the study), of these 93 develop MCI the follow-up period.
-
The average baseline age is 65.0286013.
# Check the proportion of women who are APOE4 carriers
MCI_women =
MCI_cleaned |>
filter(sex=="Female")
MCI_women_carrier =
MCI_women |>
filter(apoe4=="Carrier")
- 30% of women in the study are APOE4 carriers.
amyloid =
read_csv("mci_amyloid.csv", skip = 1) |>
janitor::clean_names() |>
pivot_longer(
baseline:time_8,
names_to = "time",
values_to = "amyloid") |>
mutate(amyloid = as.numeric(amyloid))
head(amyloid)
## # A tibble: 6 × 3
## study_id time amyloid
## <dbl> <chr> <dbl>
## 1 1 baseline 0.111
## 2 1 time_2 NA
## 3 1 time_4 0.109
## 4 1 time_6 0.105
## 5 1 time_8 0.107
## 6 2 baseline 0.107
Here, I also used “skip=1” parameter to exclude the first row because it is not the column name we want. Key features include
Now we check whether some participants appear in only the baseline or amyloid datasets:
Let’s first check how many are only in the MCI_cleaned/baseline dataset:
check_id_mci = anti_join(MCI_cleaned, amyloid, by = c("id" = "study_id"))
check_id_mci
## # A tibble: 8 × 6
## id current_age sex education apoe4 age_at_onset
## <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 14 58.4 Female 20 Non-carrier 66.2
## 2 49 64.7 Male 16 Non-carrier 68.4
## 3 92 68.6 Female 20 Non-carrier 0
## 4 179 68.1 Male 16 Non-carrier 0
## 5 268 61.4 Female 18 Carrier 67.5
## 6 304 63.8 Female 16 Non-carrier 0
## 7 389 59.3 Female 16 Non-carrier 0
## 8 412 67 Male 16 Carrier 0
There are 8 participants only in the MCI_cleaned/baseline dataset.
Then let’s see how many participants only appear in the amyloid:
check_id_amyloid = anti_join(amyloid, MCI_cleaned, by = c("study_id"="id"))
head(check_id_amyloid)
## # A tibble: 6 × 3
## study_id time amyloid
## <dbl> <chr> <dbl>
## 1 72 baseline 0.107
## 2 72 time_2 NA
## 3 72 time_4 0.107
## 4 72 time_6 0.107
## 5 72 time_8 NA
## 6 234 baseline 0.111
There are 80 participants only in the amyloid dataset. So, yes, there are many participants who appear in only the amyloid dataset, and a lot only appear only in the baseline dataset.
Now, we combine the demographic and biomarker datasets so that only participants who appear in both datasets are retained:
MCI_df = inner_join(MCI_cleaned, amyloid, by = c("id" = "study_id"))
head(MCI_df)
## # A tibble: 6 × 8
## id current_age sex education apoe4 age_at_onset time amyloid
## <dbl> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1 63.1 Female 16 Carrier 0 baseline 0.111
## 2 1 63.1 Female 16 Carrier 0 time_2 NA
## 3 1 63.1 Female 16 Carrier 0 time_4 0.109
## 4 1 63.1 Female 16 Carrier 0 time_6 0.105
## 5 1 63.1 Female 16 Carrier 0 time_8 0.107
## 6 2 65.6 Female 20 Carrier 0 baseline 0.107
The resulting dataset contains 2355 rows/observations and 8 columns. Key features include the study id for each participant, their current age, gender, education level in years, their age at the onset of MCI and the apoe4 indicator. The amyloid column, after cleaning, contains measurements of biomarkers amyloid, from baseline level to the levels after different time intervals across the study.
Finally, we export the result as a CSV to current data directory:
write.csv(MCI_df, file = "MCI_df.csv")