# Set your working directory.
# NOTE: this path is specific to one machine; change it to the folder that
# holds physicalData.csv before running the rest of the script.
setwd("c:/Users/sshres07/Box/Teaching_2022/2022/Spring/SpringHIA226/LectureSlides/Week2")
# Read in the phys data (header = TRUE: the first line of the file supplies
# the column names). Every later section indexes into this data frame.
phys = read.csv("physicalData.csv", header = TRUE)
# Let's get started with the basics.
# Use the help command on library()
# Create a sequence from 10 to 100
first_seq = seq(10, 100)
# Calculate mean
# Use the following vector and use the trim argument of mean()
data_outlier = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19, 4000)
# Untrimmed mean: the single extreme value (4000) is included.
mean(data_outlier)
# Try trim = 0.01, 0.04, 0.05, 0.1
# trim is the fraction of observations dropped from EACH end of the sorted
# vector before averaging. With 20 values, a fraction below 0.05 removes
# nothing, so the first two results equal the untrimmed mean.
mean(data_outlier, trim = 0.01)
mean(data_outlier, trim = 0.04)
mean(data_outlier, trim = 0.05)
mean(data_outlier, trim = 0.1)
# Can you explain the differences in output?
####################### MATRIX NOTATION #############################################
# Get the 3rd observation (row index before the comma, all columns)
phys[3, ]
# Get the 4th variable (note: variable/column vs observation)
# Get the value of the 4th variable in the 3rd observation
# Let's combine some of the commands for matrix notation
# Use the concatenate c() function
# Get a list of observations
# 1) 4 to 10 (a contiguous range of rows)
phys[4:10, ]
# 2) 4 and 10 (just these two rows, combined with c())
phys[c(4, 10), ]
# Display the variables exercise and age
# Display observations where selfrate is "very good"
############################# ORDER #####################################################
# Display phys with its rows sorted by age (order() sorts ascending by default).
# The sorted copy is only printed; phys itself keeps its original row order.
phys[order(phys[["age"]]), ]
# Order by bmi
############################# Add or delete a case ####################################
# Delete cases where id = 1, 10, 15, 20.
# Match on the id column itself rather than on row position, so the intended
# cases are dropped even if the rows are not stored in id order.
# The result is only displayed; phys is not modified.
phys[!(phys$id %in% c(1, 10, 15, 20)), ]
# Create a new variable which is ldl+hdl/2
# Delete the new variable
################################# Replacing data conditionally ####################
# Replace all BMI > 35 to NA
############################## Subsetting by cases and variables
# Create a subset of data that only includes the age and ldl
# Create a subset of data that includes age and ldl where selfrate is verygood
################################Aggregate ####################################
# Summary function: returns the mean, median, min, max and sd of x as a
# named numeric vector of length 5.
#   x     : numeric vector to summarise
#   na.rm : if TRUE, missing values are dropped before each statistic is
#           computed; the default FALSE keeps R's usual NA propagation
myfun = function(x, na.rm = FALSE) {
  c(
    mean = mean(x, na.rm = na.rm),
    median = median(x, na.rm = na.rm),
    min = min(x, na.rm = na.rm),
    max = max(x, na.rm = na.rm),
    sd = sd(x, na.rm = na.rm)
  )
}
# Aggregate ldl by exercise and apply the function:
# myfun is called once per distinct value of exercise, on the ldl values
# of the rows in that group.
aggregate(phys$ldl, by = list(exercise = phys$exercise), FUN = myfun)
############################ Create subsets and then join #####################
# subset_a only includes id, exercise, and age
# (remove observations c(1, 4, 10, 15, 16, 20, 34, 55) by row position)
subset_a = phys[-c(1, 4, 10, 15, 16, 20, 34, 55), c("id", "exercise", "age")]
# subset_b keeps the columns id, hdl and ldl, and drops the observations in
# row positions 1, 4, 8, 13, 20, 34, 55 and 73 (negative indices exclude rows).
subset_b = phys[
  -c(1, 4, 8, 13, 20, 34, 55, 73),
  c("id", "hdl", "ldl")
]
# By id perform
# Full join / outer join (union): keep every id found in either subset;
# columns with no match are filled with NA.
Full = merge(subset_a, subset_b, by = "id", all = TRUE)
# Right outer join: keep every row of subset_b (the second table) and
# attach subset_a columns where the id matches.
# NOTE(review): assumes subset_a is the left table and subset_b the right
# one in all four joins - confirm this matches the lecture slides.
Right = merge(subset_a, subset_b, by = "id", all.y = TRUE)
# Left outer join: keep every row of subset_a (the first table) and
# attach subset_b columns where the id matches.
Left = merge(subset_a, subset_b, by = "id", all.x = TRUE)
# Inner join (intersection): keep only ids present in both subsets.
Inner = merge(subset_a, subset_b, by = "id")