A review of the first few lectures

Quick review of April 13th session

Using R as a calculator:

2*8+3

## [1] 19

2^3*5-3

## [1] 37

a <- 2^3*5-3
a-6

## [1] 31

Using print and paste:

print("It's a good day!")

## [1] "It's a good day!"

print(paste("I am", a, "years old."))

## [1] "I am 37 years old."

print(paste("I am", a, "years old.", sep="-"))

## [1] "I am-37-years old."

Getting help from R:

help(read.table)
?read.table

Creating vectors:

weight.1 <- c(112,106,121,119)
weight.2 <- c(122,136,133,140)
weight.2-weight.1

## [1] 10 30 12 21

Testing, and logical variables:

fruits <- c("Apple","Orange","Carrot")
fruits=="Apple"

## [1]  TRUE FALSE FALSE

isApple <- fruits=="Apple"
isApple

## [1]  TRUE FALSE FALSE

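Factors are categorical variables. They are needed so that R can distinguish between categorical and numerical variables. Note that, as the last step below shows, assigning levels() to a plain numeric vector only attaches a "levels" attribute; the vector is still numeric and is not converted to a factor (use factor() or as.factor() for that):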

pain <- c(0,3,2,2,1)
pain

## [1] 0 3 2 2 1

levels(pain)

## NULL

levels(as.factor(pain))

## [1] "0" "1" "2" "3"

levels(pain) <- c("none","mild","medium","severe")
levels(pain)

## [1] "none"   "mild"   "medium" "severe"

pain

## [1] 0 3 2 2 1
## attr(,"levels")
## [1] "none"   "mild"   "medium" "severe"

Quick review of April 14th session

Sequences:

4:9

## [1] 4 5 6 7 8 9

seq(4,9)

## [1] 4 5 6 7 8 9

seq(4,9,1)

## [1] 4 5 6 7 8 9

seq(4,10,2)

## [1]  4  6  8 10

Replicates:

oops <- c(7,9,13)
rep(oops,3)

## [1]  7  9 13  7  9 13  7  9 13

rep(oops,1:3)

## [1]  7  9  9 13 13 13

rep(c("male","female"),c(4,6))

##  [1] "male"   "male"   "male"   "male"   "female" "female" "female"
##  [8] "female" "female" "female"

rep(c("male","female"),each=5)

##  [1] "male"   "male"   "male"   "male"   "male"   "female" "female"
##  [8] "female" "female" "female"

Lists are useful for combining different collections of objects.

intake.pre <- c(5260,5470,5640,6180,6390,6515,6805,7515,7515,8230,8770)
intake.post <- c(3910,4220,3885,5160,5645,4680,5265,5975,6790,6900,7335)
mylist <- list(before=intake.pre, after=intake.post)
mylist

## $before
##  [1] 5260 5470 5640 6180 6390 6515 6805 7515 7515 8230 8770
## 
## $after
##  [1] 3910 4220 3885 5160 5645 4680 5265 5975 6790 6900 7335

mylist$after

##  [1] 3910 4220 3885 5160 5645 4680 5265 5975 6790 6900 7335

Matrices:

x <- 1:12
dim(x) <- c(3,4)
x

##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12

y <- matrix(1:12,nrow=3,byrow=TRUE)
y

##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    5    6    7    8
## [3,]    9   10   11   12

dim(x)

## [1] 3 4

A data frame is the most useful way to deal with table-like data. Creating a data frame:

testdata <- data.frame(id=paste('pig',1:9,sep=""),pre.weaning=101:109,post.weaning=209:201)
testdata

##     id pre.weaning post.weaning
## 1 pig1         101          209
## 2 pig2         102          208
## 3 pig3         103          207
## 4 pig4         104          206
## 5 pig5         105          205
## 6 pig6         106          204
## 7 pig7         107          203
## 8 pig8         108          202
## 9 pig9         109          201

testdata$id

## [1] pig1 pig2 pig3 pig4 pig5 pig6 pig7 pig8 pig9
## Levels: pig1 pig2 pig3 pig4 pig5 pig6 pig7 pig8 pig9

testdata$post.weaning

## [1] 209 208 207 206 205 204 203 202 201

testdata$post.weaning[2]

## [1] 208

Slicing:

test <- data.frame(x=1:5,y=6:10,z=11:15)
test

##   x  y  z
## 1 1  6 11
## 2 2  7 12
## 3 3  8 13
## 4 4  9 14
## 5 5 10 15

test[,2]

## [1]  6  7  8  9 10

test[2,]

##   x y  z
## 2 2 7 12

test[2,2]

## [1] 7

test[2,c(1,3)]

##   x  z
## 2 2 12

Indexing:

colnames(test) <- c("xx","yy","zz")
rownames(test) <- c("a","b","c","d","e")
test

##   xx yy zz
## a  1  6 11
## b  2  7 12
## c  3  8 13
## d  4  9 14
## e  5 10 15

test[,"yy"]

## [1]  6  7  8  9 10

test["b",]

##   xx yy zz
## b  2  7 12

test["b","yy"]

## [1] 7

test["b",c("xx","zz")]

##   xx zz
## b  2 12

subset(test, select=xx)

##   xx
## a  1
## b  2
## c  3
## d  4
## e  5

subset(test, select=-xx)

##   yy zz
## a  6 11
## b  7 12
## c  8 13
## d  9 14
## e 10 15

Slicing & indexing:

test[1:3,c("xx","zz")]

##   xx zz
## a  1 11
## b  2 12
## c  3 13

Dropping data:

test

##   xx yy zz
## a  1  6 11
## b  2  7 12
## c  3  8 13
## d  4  9 14
## e  5 10 15

test[,-1]

##   yy zz
## a  6 11
## b  7 12
## c  8 13
## d  9 14
## e 10 15

test[-1,]

##   xx yy zz
## b  2  7 12
## c  3  8 13
## d  4  9 14
## e  5 10 15

subset(test,select=-xx)

##   yy zz
## a  6 11
## b  7 12
## c  8 13
## d  9 14
## e 10 15

Conditional selection:

test

##   xx yy zz
## a  1  6 11
## b  2  7 12
## c  3  8 13
## d  4  9 14
## e  5 10 15

test[test$xx!=3,]

##   xx yy zz
## a  1  6 11
## b  2  7 12
## d  4  9 14
## e  5 10 15

test[test$xx>3,]

##   xx yy zz
## d  4  9 14
## e  5 10 15

test[test$xx>=3 & test$zz<=14,]

##   xx yy zz
## c  3  8 13
## d  4  9 14

test[test$xx>3 | test$zz<12,]

##   xx yy zz
## a  1  6 11
## d  4  9 14
## e  5 10 15

Read data file:

To read a space-separated file like data1.txt that has a header row (the options may differ depending on the file you are reading):

data1 <- read.table(file.choose(), header=TRUE, colClasses=c("character","character","numeric","numeric"))

Get some information about the file:

colnames(data1)

## [1] "ID"       "proteinl" "birthw"   "weanw"

head(data1)

##      ID proteinl birthw weanw
## 1 anim1     high    6.1  23.1
## 2 anim2     high    4.8  16.2
## 3 anim3     high    5.2  16.1
## 4 anim4     high    5.4  15.3
## 5 anim5     high    4.4  18.6
## 6 anim6     high    6.1  23.7

head(data1,10)

##        ID proteinl birthw weanw
## 1   anim1     high    6.1  23.1
## 2   anim2     high    4.8  16.2
## 3   anim3     high    5.2  16.1
## 4   anim4     high    5.4  15.3
## 5   anim5     high    4.4  18.6
## 6   anim6     high    6.1  23.7
## 7   anim7     high    7.3  21.6
## 8   anim8     high    4.1  17.8
## 9   anim9     high    5.1  23.1
## 10 anim10     high    4.8  18.2

tail(data1)

##          ID proteinl birthw weanw
## 495 anim495      low    5.2  16.0
## 496 anim496      low    3.7  12.0
## 497 anim497      low    5.1  15.0
## 498 anim498      low    5.0  13.0
## 499 anim499      low    4.4  13.9
## 500 anim500      low    6.1  15.9

dim(data1)

## [1] 500   4

ncol(data1)

## [1] 4

nrow(data1)

## [1] 500

summary(data1)

##       ID              proteinl             birthw          weanw      
##  Length:500         Length:500         Min.   :2.500   Min.   :11.20  
##  Class :character   Class :character   1st Qu.:4.300   1st Qu.:14.90  
##  Mode  :character   Mode  :character   Median :5.000   Median :16.60  
##                                        Mean   :4.999   Mean   :17.12  
##                                        3rd Qu.:5.700   3rd Qu.:19.10  
##                                        Max.   :7.500   Max.   :25.00

Quick review of April 15th session

For the solutions to the homework, check the presentation file.

Write data file:

Let's say we want to write the first 100 rows of data1 to a space-separated file, data1small.txt, with a header row, no quotes, and no row names (the options may differ depending on the file you want to write):

write.table(data1[1:100,], file="data1small.txt", quote=FALSE, row.names=FALSE)
getwd()

You can write notes and outputs to an external file. Consider the following example:

write(file="log.txt", print("This is my log file."))
write(file="log.txt", print(paste("Birth weight average =", mean(data1$birthw))), append=TRUE)
write(file="log.txt", print(paste("Weaning weight average =", mean(data1$weanw))), append=TRUE)

In log.txt, you would have:

This is my log file.
Birth weight average = 4.9986
Weaning weight average = 17.1162

To view a data set in RStudio, either click on it in the Environment window or run:

View(data1)

To start editing the data manually:

edit(data1)

You can get many descriptive statistics from your data. For example:

mean(data1$birthw)

## [1] 4.9986

cor(data1$birthw, data1$weanw)

## [1] 0.4209551

Merging two data frames (different options are used; check out the differences):

df1 <- data.frame(anim=paste0('anim',1:9), food=19:11)
df2 <- data.frame(anim=paste0('anim',2:10),water=22:30)
df1

##    anim food
## 1 anim1   19
## 2 anim2   18
## 3 anim3   17
## 4 anim4   16
## 5 anim5   15
## 6 anim6   14
## 7 anim7   13
## 8 anim8   12
## 9 anim9   11

df2

##     anim water
## 1  anim2    22
## 2  anim3    23
## 3  anim4    24
## 4  anim5    25
## 5  anim6    26
## 6  anim7    27
## 7  anim8    28
## 8  anim9    29
## 9 anim10    30

df3 <- merge(df1, df2, by="anim")
df3

##    anim food water
## 1 anim2   18    22
## 2 anim3   17    23
## 3 anim4   16    24
## 4 anim5   15    25
## 5 anim6   14    26
## 6 anim7   13    27
## 7 anim8   12    28
## 8 anim9   11    29

df3 <- merge(df1, df2, by="anim", all.x=TRUE)
df3

##    anim food water
## 1 anim1   19    NA
## 2 anim2   18    22
## 3 anim3   17    23
## 4 anim4   16    24
## 5 anim5   15    25
## 6 anim6   14    26
## 7 anim7   13    27
## 8 anim8   12    28
## 9 anim9   11    29

df3 <- merge(df1, df2, by="anim", all.y=TRUE)
df3

##     anim food water
## 1  anim2   18    22
## 2  anim3   17    23
## 3  anim4   16    24
## 4  anim5   15    25
## 5  anim6   14    26
## 6  anim7   13    27
## 7  anim8   12    28
## 8  anim9   11    29
## 9 anim10   NA    30

What if the "by" column has different names in the two data sets?

colnames(df2)[1] <- "animal"
colnames(df1)

## [1] "anim" "food"

colnames(df2)

## [1] "animal" "water"

df3 <- merge(df1, df2, by.x="anim", by.y="animal")
df3

##    anim food water
## 1 anim2   18    22
## 2 anim3   17    23
## 3 anim4   16    24
## 4 anim5   15    25
## 5 anim6   14    26
## 6 anim7   13    27
## 7 anim8   12    28
## 8 anim9   11    29

cbind

To do cbind, the data frames should have the same number of rows.

df1

##    anim food
## 1 anim1   19
## 2 anim2   18
## 3 anim3   17
## 4 anim4   16
## 5 anim5   15
## 6 anim6   14
## 7 anim7   13
## 8 anim8   12
## 9 anim9   11

df2

##   animal water
## 1  anim2    22
## 2  anim3    23
## 3  anim4    24
## 4  anim5    25
## 5  anim6    26
## 6  anim7    27
## 7  anim8    28
## 8  anim9    29
## 9 anim10    30

df3 <- cbind(df1, df2)
df3

##    anim food animal water
## 1 anim1   19  anim2    22
## 2 anim2   18  anim3    23
## 3 anim3   17  anim4    24
## 4 anim4   16  anim5    25
## 5 anim5   15  anim6    26
## 6 anim6   14  anim7    27
## 7 anim7   13  anim8    28
## 8 anim8   12  anim9    29
## 9 anim9   11 anim10    30

rbind

To do rbind, the data frames should have the same number of columns and the same column names.

colnames(df1) <- c("anim","food_water")
colnames(df2) <- c("anim","food_water")
df3 <- rbind(df1, df2)
df3

##      anim food_water
## 1   anim1         19
## 2   anim2         18
## 3   anim3         17
## 4   anim4         16
## 5   anim5         15
## 6   anim6         14
## 7   anim7         13
## 8   anim8         12
## 9   anim9         11
## 10  anim2         22
## 11  anim3         23
## 12  anim4         24
## 13  anim5         25
## 14  anim6         26
## 15  anim7         27
## 16  anim8         28
## 17  anim9         29
## 18 anim10         30

sort

sort is not made for data frames! It is for vectors or a single column in a data frame.

df1 <- data.frame(anim=paste0('anim',1:9), food=19:11)
df2 <- data.frame(anim=paste0('anim',2:10),water=22:30)
df3 <- merge(df1, df2, all.x=TRUE)
df3

##    anim food water
## 1 anim1   19    NA
## 2 anim2   18    22
## 3 anim3   17    23
## 4 anim4   16    24
## 5 anim5   15    25
## 6 anim6   14    26
## 7 anim7   13    27
## 8 anim8   12    28
## 9 anim9   11    29

sort(df3$food)

## [1] 11 12 13 14 15 16 17 18 19

sort(df3$food, decreasing=TRUE)

## [1] 19 18 17 16 15 14 13 12 11

order

Example of the order command:

df3[order(df3$food),]

##    anim food water
## 9 anim9   11    29
## 8 anim8   12    28
## 7 anim7   13    27
## 6 anim6   14    26
## 5 anim5   15    25
## 4 anim4   16    24
## 3 anim3   17    23
## 2 anim2   18    22
## 1 anim1   19    NA

df3[order(-df3$food),]

##    anim food water
## 1 anim1   19    NA
## 2 anim2   18    22
## 3 anim3   17    23
## 4 anim4   16    24
## 5 anim5   15    25
## 6 anim6   14    26
## 7 anim7   13    27
## 8 anim8   12    28
## 9 anim9   11    29

df3$group <- rep(1:3,3)
df3

##    anim food water group
## 1 anim1   19    NA     1
## 2 anim2   18    22     2
## 3 anim3   17    23     3
## 4 anim4   16    24     1
## 5 anim5   15    25     2
## 6 anim6   14    26     3
## 7 anim7   13    27     1
## 8 anim8   12    28     2
## 9 anim9   11    29     3

df3[order(-df3$group, df3$food),]

##    anim food water group
## 9 anim9   11    29     3
## 6 anim6   14    26     3
## 3 anim3   17    23     3
## 8 anim8   12    28     2
## 5 anim5   15    25     2
## 2 anim2   18    22     2
## 7 anim7   13    27     1
## 4 anim4   16    24     1
## 1 anim1   19    NA     1

Matrix operations (not presented, but slides are available for those who are interested.)