Skip to content

Latest commit

 

History

History
478 lines (393 loc) · 17.6 KB

p8105_hw2_md4270.md

File metadata and controls

478 lines (393 loc) · 17.6 KB

p8105_hw2_md4270

Manye Dong 2023-09-30

library(tidyverse)
library(readr)
library(readxl)

Problem 1

Clean pols_month dataset

pols_month = 
  read_csv("pols-month.csv") |>
  janitor::clean_names() |>
  separate(mon, sep = "-", into = c("year", "month", "day")) |>
  mutate(year = as.numeric(year), month = as.numeric(month), 
         day = as.numeric(day), month = month.name[month],
         president = ifelse(prez_gop==0, "dem", "gop")) |>
  select(-c(prez_dem, prez_gop, day))

head(pols_month)
## # A tibble: 6 × 9
##    year month    gov_gop sen_gop rep_gop gov_dem sen_dem rep_dem president
##   <dbl> <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>    
## 1  1947 January       23      51     253      23      45     198 dem      
## 2  1947 February      23      51     253      23      45     198 dem      
## 3  1947 March         23      51     253      23      45     198 dem      
## 4  1947 April         23      51     253      23      45     198 dem      
## 5  1947 May           23      51     253      23      45     198 dem      
## 6  1947 June          23      51     253      23      45     198 dem

Clean snp dataset

snp = 
  read_csv("snp.csv") |>
  janitor::clean_names() |>
  separate(date, sep = "/", into = c("month", "day", "year")) |>
  mutate(year = as.numeric(year), month = as.numeric(month), 
         day = as.numeric(day), month = month.name[month], 
         year = ifelse((50<=year & year<100), 1900+year, 2000+year)) |>
  select(year, month, close)

head(snp)
## # A tibble: 6 × 3
##    year month    close
##   <dbl> <chr>    <dbl>
## 1  2015 July     2080.
## 2  2015 June     2063.
## 3  2015 May      2107.
## 4  2015 April    2086.
## 5  2015 March    2068.
## 6  2015 February 2104.

Clean Unemployment Dataset

unemployment = 
  read_csv("unemployment.csv") |>
  janitor::clean_names() |>
  pivot_longer(
    jan:dec,
    names_to = "month", 
    values_to = "unemployment_percent") |>
  mutate(year = as.numeric(year), 
         month = recode(month, 
                   "jan" = "January", 
                   "feb" = "February", 
                   "mar" = "March", 
                   "apr" = "April", 
                   "may" = "May", 
                   "jun" = "June", 
                   "jul" = "July", 
                   "aug" = "August", 
                   "sep" = "September", 
                   "oct" = "October", 
                   "nov" = "November", 
                   "dec" = "December"))

head(unemployment)
## # A tibble: 6 × 3
##    year month    unemployment_percent
##   <dbl> <chr>                   <dbl>
## 1  1948 January                   3.4
## 2  1948 February                  3.8
## 3  1948 March                     4  
## 4  1948 April                     3.9
## 5  1948 May                       3.5
## 6  1948 June                      3.6

Next, we merge the three datasets:

joined = 
  left_join(pols_month, snp) |>
  left_join(x = _, y = unemployment)

head(joined)
## # A tibble: 6 × 11
##    year month    gov_gop sen_gop rep_gop gov_dem sen_dem rep_dem president close
##   <dbl> <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>     <dbl>
## 1  1947 January       23      51     253      23      45     198 dem          NA
## 2  1947 February      23      51     253      23      45     198 dem          NA
## 3  1947 March         23      51     253      23      45     198 dem          NA
## 4  1947 April         23      51     253      23      45     198 dem          NA
## 5  1947 May           23      51     253      23      45     198 dem          NA
## 6  1947 June          23      51     253      23      45     198 dem          NA
## # ℹ 1 more variable: unemployment_percent <dbl>

The pols_month data contains 822 rows and 9 columns. The range of the year goes from 1947 to 2015. Key variables include the number of people in both parties in senate, representatives and government, as well as the party of the president in the corresponding date.

The snp data contains 787 rows and 3 features. The range of the year goes from 1947 to 2015. Key variables include the closing values of the S&P stock index on the associated date.

The unemployement data contains 816 rows and 3 features. The range of year goes from 1948 to 2015. Key variables include the unemployment_percent for the corresponding year.

The resulting dataset contains 822 rows and 11 columns. The range of the year goes from 1947 to 2015. Key variables include all the columns noted above, plus the “Na” values for the close column.

Problem 2

Clean Mr. Trash Wheel dataset

mister_trash = 
  read_excel("202309 Trash Wheel Collection Data.xlsx", 
             sheet = "Mr. Trash Wheel", range = "A2:N586") |>
  janitor::clean_names() |>
  separate(date, sep ="-", into = c("year", "month", "day")) |>
  mutate(homes_powered = (weight_tons * 500) / 30, 
         month = month.abb[as.numeric(month)],
         wheel = "mister")

tail(mister_trash)
## # A tibble: 6 × 15
##   dumpster year  month day   weight_tons volume_cubic_yards plastic_bottles
##      <dbl> <chr> <chr> <chr>       <dbl>              <dbl>           <dbl>
## 1      579 2023  May   04           3.66                 15            2200
## 2      580 2023  May   04           2.62                 15            3100
## 3      581 2023  Jun   28           3.56                 15            3400
## 4      582 2023  Jun   28           3.79                 15            4000
## 5      583 2023  Jun   28           2.28                 10            2100
## 6      584 2023  Jun   29           3.9                  15            1900
## # ℹ 8 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## #   glass_bottles <dbl>, plastic_bags <dbl>, wrappers <dbl>,
## #   sports_balls <dbl>, homes_powered <dbl>, wheel <chr>

Clean Professor Trash Wheel dataset

prof_trash = 
  read_excel("202309 Trash Wheel Collection Data.xlsx", 
             sheet = "Professor Trash Wheel", range = "A2:M108") |>
  janitor::clean_names() |>
  separate(date, sep ="-", into = c("year", "month", "day")) |>
  mutate(homes_powered = (weight_tons * 500) / 30, 
         month = month.abb[as.numeric(month)],
         wheel = "professor")

tail(prof_trash)
## # A tibble: 6 × 14
##   dumpster year  month day   weight_tons volume_cubic_yards plastic_bottles
##      <dbl> <chr> <chr> <chr>       <dbl>              <dbl>           <dbl>
## 1      101 2022  Dec   29           2.13                 15            4200
## 2      102 2023  Feb   15           2.41                 15            2900
## 3      103 2023  Apr   10           2.05                 15              NA
## 4      104 2023  Apr   20           2.58                 15            3400
## 5      105 2023  Jun   16           1.85                 15            4100
## 6      106 2023  Jun   29           2.25                 15            3900
## # ℹ 7 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## #   glass_bottles <dbl>, plastic_bags <dbl>, wrappers <dbl>,
## #   homes_powered <dbl>, wheel <chr>

Clean Gwynnda Trash dataset

gwynnda_trash = 
  read_excel("202309 Trash Wheel Collection Data.xlsx", 
             sheet = "Gwynnda Trash Wheel", range = "A2:L157") |>
  janitor::clean_names() |>
  separate(date, sep ="-", into = c("year", "month", "day")) |>
  mutate(homes_powered = (weight_tons * 500) / 30, 
         month = month.abb[as.numeric(month)],
         wheel = "gwynnda")

tail(gwynnda_trash)
## # A tibble: 6 × 13
##   dumpster year  month day   weight_tons volume_cubic_yards plastic_bottles
##      <dbl> <chr> <chr> <chr>       <dbl>              <dbl>           <dbl>
## 1      149 2023  Jun   28           2.68                 15            5400
## 2      150 2023  Jun   29           2.74                 15            4400
## 3      151 2023  Jun   29           3.12                 15            5000
## 4      152 2023  Jun   29           3.12                 15            3400
## 5      153 2023  Jun   29           3.45                 15            4900
## 6      154 2023  Jun   30           2.88                 15            2500
## # ℹ 6 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## #   plastic_bags <dbl>, wrappers <dbl>, homes_powered <dbl>, wheel <chr>

Now we combine these three datasets:

trash_wheels =
  bind_rows(mister_trash, prof_trash, gwynnda_trash) |>
  janitor::clean_names()

tail(trash_wheels)
## # A tibble: 6 × 15
##   dumpster year  month day   weight_tons volume_cubic_yards plastic_bottles
##      <dbl> <chr> <chr> <chr>       <dbl>              <dbl>           <dbl>
## 1      149 2023  Jun   28           2.68                 15            5400
## 2      150 2023  Jun   29           2.74                 15            4400
## 3      151 2023  Jun   29           3.12                 15            5000
## 4      152 2023  Jun   29           3.12                 15            3400
## 5      153 2023  Jun   29           3.45                 15            4900
## 6      154 2023  Jun   30           2.88                 15            2500
## # ℹ 8 more variables: polystyrene <dbl>, cigarette_butts <dbl>,
## #   glass_bottles <dbl>, plastic_bags <dbl>, wrappers <dbl>,
## #   sports_balls <dbl>, homes_powered <dbl>, wheel <chr>
  • There are 845 rows/observations in the resulting dataset.

  • Key variables include dumpster id, year-month-day as the time of the record for the trash collections, and the type of trash collected by each dumpster.

  • The total weight of trash collected by Professor Trash Wheel is 216.26 in tons.

# filter the conditions for trash
filtered_gywnnda =  
  gwynnda_trash |>
  select(year, month, cigarette_butts) |>
  filter(year=="2021", month=="Jul")

head(filtered_gywnnda)
  • The total number of cigarette butts collected by Gwynnda in July of 2021 is 1.63^{4} (16300).

Problem 3

Baseline Dataset

MCI_baseline = read_csv("MCI_baseline.csv", skip=1)
MCI_cleaned = 
  MCI_baseline |> 
  janitor::clean_names() |>
  mutate(sex = ifelse(sex==0, "Female", "Male"), 
         apoe4 = ifelse(apoe4==0, "Non-carrier", "Carrier"), 
         age_at_onset = ifelse(age_at_onset==".", 0, age_at_onset), 
         age_at_onset = as.numeric(age_at_onset))
MCI_cleaned
## # A tibble: 483 × 6
##       id current_age sex    education apoe4       age_at_onset
##    <dbl>       <dbl> <chr>      <dbl> <chr>              <dbl>
##  1     1        63.1 Female        16 Carrier              0  
##  2     2        65.6 Female        20 Carrier              0  
##  3     3        62.5 Male          16 Carrier             66.8
##  4     4        69.8 Female        16 Non-carrier          0  
##  5     5        66   Male          16 Non-carrier         68.7
##  6     6        62.5 Male          16 Non-carrier          0  
##  7     7        66.5 Male          18 Non-carrier         74  
##  8     8        67.2 Female        18 Non-carrier          0  
##  9     9        66.7 Female        16 Non-carrier          0  
## 10    10        64.1 Female        18 Non-carrier          0  
## # ℹ 473 more rows
MCI_predate = MCI_cleaned |>
  filter(current_age >= age_at_onset & age_at_onset != 0)
MCI_predate
## # A tibble: 4 × 6
##      id current_age sex    education apoe4       age_at_onset
##   <dbl>       <dbl> <chr>      <dbl> <chr>              <dbl>
## 1    72        62.9 Male          16 Carrier             62.5
## 2   234        66.7 Male          18 Non-carrier         66.7
## 3   283        69   Male          16 Non-carrier         68.3
## 4   380        70.3 Female        16 Non-carrier         70.2

As seen in the above table, we exclude those whose onset age is smaller than their current age:

MCI_cleaned = MCI_cleaned |>
  filter(id != 72) |>
  filter(id != 234) |>
  filter(id != 283) |>
  filter(id != 380)
MCI_cleaned
## # A tibble: 479 × 6
##       id current_age sex    education apoe4       age_at_onset
##    <dbl>       <dbl> <chr>      <dbl> <chr>              <dbl>
##  1     1        63.1 Female        16 Carrier              0  
##  2     2        65.6 Female        20 Carrier              0  
##  3     3        62.5 Male          16 Carrier             66.8
##  4     4        69.8 Female        16 Non-carrier          0  
##  5     5        66   Male          16 Non-carrier         68.7
##  6     6        62.5 Male          16 Non-carrier          0  
##  7     7        66.5 Male          18 Non-carrier         74  
##  8     8        67.2 Female        18 Non-carrier          0  
##  9     9        66.7 Female        16 Non-carrier          0  
## 10    10        64.1 Female        18 Non-carrier          0  
## # ℹ 469 more rows
  • I used “skip=1” parameter when importing data to exclude the first row of the because it will mess up with the column names one row below. I also used filter function to exclude those whose onset age is before this study.

  • Relevant features of the dataset include current age of participants, their gender, education in years, and the age at onset of MCI in decimal form after data cleaning.

  • There were originally 479 participants recruited (already excluding those who developed MCI before the study), of these 93 develop MCI the follow-up period.

  • The average baseline age is 65.0286013.

# Check the proportion of women who are APOE4 carriers
MCI_women = 
  MCI_cleaned |>
  filter(sex=="Female")

MCI_women_carrier = 
  MCI_women |>
  filter(apoe4=="Carrier")
  • 30% of women in the study are APOE4 carriers.

Longitudinal Study

amyloid =
  read_csv("mci_amyloid.csv", skip = 1) |>
  janitor::clean_names() |>
  pivot_longer(
    baseline:time_8,
    names_to = "time", 
    values_to = "amyloid") |>
  mutate(amyloid = as.numeric(amyloid))

head(amyloid)
## # A tibble: 6 × 3
##   study_id time     amyloid
##      <dbl> <chr>      <dbl>
## 1        1 baseline   0.111
## 2        1 time_2    NA    
## 3        1 time_4     0.109
## 4        1 time_6     0.105
## 5        1 time_8     0.107
## 6        2 baseline   0.107

Here, I also used “skip=1” parameter to exclude the first row because it is not the column name we want. Key features include

Now we check whether some participants appear in only the baseline or amyloid datasets:

Let’s first check how many are only in the MCI_cleaned/baseline dataset:

check_id_mci = anti_join(MCI_cleaned, amyloid, by = c("id" = "study_id"))

check_id_mci
## # A tibble: 8 × 6
##      id current_age sex    education apoe4       age_at_onset
##   <dbl>       <dbl> <chr>      <dbl> <chr>              <dbl>
## 1    14        58.4 Female        20 Non-carrier         66.2
## 2    49        64.7 Male          16 Non-carrier         68.4
## 3    92        68.6 Female        20 Non-carrier          0  
## 4   179        68.1 Male          16 Non-carrier          0  
## 5   268        61.4 Female        18 Carrier             67.5
## 6   304        63.8 Female        16 Non-carrier          0  
## 7   389        59.3 Female        16 Non-carrier          0  
## 8   412        67   Male          16 Carrier              0

There are 8 participants only in the MCI_cleaned/baseline dataset.

Then let’s see how many participants only appear in the amyloid:

check_id_amyloid = anti_join(amyloid, MCI_cleaned, by = c("study_id"="id"))

head(check_id_amyloid)
## # A tibble: 6 × 3
##   study_id time     amyloid
##      <dbl> <chr>      <dbl>
## 1       72 baseline   0.107
## 2       72 time_2    NA    
## 3       72 time_4     0.107
## 4       72 time_6     0.107
## 5       72 time_8    NA    
## 6      234 baseline   0.111

There are 80 participants only in the amyloid dataset. So, yes, there are many participants who appear in only the amyloid dataset, and a lot only appear only in the baseline dataset.

Now, we combine the demographic and biomarker datasets so that only participants who appear in both datasets are retained:

MCI_df = inner_join(MCI_cleaned, amyloid, by = c("id" = "study_id"))

head(MCI_df)
## # A tibble: 6 × 8
##      id current_age sex    education apoe4   age_at_onset time     amyloid
##   <dbl>       <dbl> <chr>      <dbl> <chr>          <dbl> <chr>      <dbl>
## 1     1        63.1 Female        16 Carrier            0 baseline   0.111
## 2     1        63.1 Female        16 Carrier            0 time_2    NA    
## 3     1        63.1 Female        16 Carrier            0 time_4     0.109
## 4     1        63.1 Female        16 Carrier            0 time_6     0.105
## 5     1        63.1 Female        16 Carrier            0 time_8     0.107
## 6     2        65.6 Female        20 Carrier            0 baseline   0.107

The resulting dataset contains 2355 rows/observations and 8 columns. Key features include the study id for each participant, their current age, gender, education level in years, their age at the onset of MCI and the apoe4 indicator. The amyloid column, after cleaning, contains measurements of biomarkers amyloid, from baseline level to the levels after different time intervals across the study.

Finally, we export the result as a CSV to current data directory:

write.csv(MCI_df, file = "MCI_df.csv")