Getting to know R: more about data types

August 31, 2018
Beginning R Data Structures Sampling Data Management and Visualization

# Session setup for this document.
# Optional machine-specific package library location, left disabled:
# .libPaths("P:/RLibrary")
# Set two global options for the session: stringsAsFactors = TRUE and warn = 0.
# The data.frame() call further down passes stringsAsFactors = FALSE itself.
options(stringsAsFactors = TRUE, warn = 0)

# Make echo = TRUE the default for every knitr chunk.
knitr::opts_chunk$set(echo = TRUE)

Data Structures

Vectors

  • numbers, characters, logical
  • one type of data per vector (mixed values are coerced to a common type)
  • 1 dimension

Create a vector using c() or vector().

# Build a character vector from elements 1 to 26 of the built-in `letters`.
vect <- c(letters[1:26]) #  create a vector
vect # typing the object's name prints its contents
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
length(vect) # how many elements are in the vector
## [1] 26

There are several ways to inspect the structure of an object:

# Four ways to inspect the same object; each reports something different.
typeof(vect) # type of data in the data structure
## [1] "character"
class(vect) # get the class of the data
## [1] "character"
dim(vect) # dimensions; NULL here because this vector has no dimensions set
## NULL
str(vect) # compact summary: type, index range, and the first few values
##  chr [1:26] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" ...

Challenge

  1. create a vector with values 1, 8, 7, 3, 4, 6, 10, 3
  2. find the mean, variance, standard deviation, standard error, and range
# Challenge solution: summary statistics for a small numeric vector.
chall2 <- c(1, 8, 7, 3, 4, 6, 10, 3)

mean(chall2) # find the mean
## [1] 5.25
var(chall2) # find the variance
## [1] 9.071429
sd(chall2); sqrt(var(chall2)) # standard deviation, computed two equivalent ways
## [1] 3.011881
## [1] 3.011881
sd(chall2)/sqrt(length(chall2))# standard error: sd divided by sqrt of the number of values
## [1] 1.064861
range(chall2) # range: the smallest and the largest value
## [1]  1 10
min(chall2) ; range(chall2)[1] # min value, directly or as the first element of range()
## [1] 1
## [1] 1
max(chall2) ; range(chall2)[2] # max, directly or as the second element of range()
## [1] 10
## [1] 10

Matrix

  • matrices have two dimensions, rows by columns
  • hold only one type of data (for example numeric, character, or logical)

mat <- matrix(NA, 5, 4) # create a matrix of NAs that is 5 rows by 4 columns
mat
##      [,1] [,2] [,3] [,4]
## [1,]   NA   NA   NA   NA
## [2,]   NA   NA   NA   NA
## [3,]   NA   NA   NA   NA
## [4,]   NA   NA   NA   NA
## [5,]   NA   NA   NA   NA
dim(mat) # number of rows, then number of columns
## [1] 5 4

# Object names must be valid R names. The three assignments below are invalid
# and are left commented out: run as live code, `2mat` is a syntax error that
# stops the whole chunk before any of the valid examples can run.
# 2mat <- matrix(0,2,3) # will not work: a name cannot start with a digit
# !mat <- matrix(0,2,3) # will not work: `!` is an operator, not part of a name
# mata-matb <- matrix(0,2,3) # will not work: `-` is read as subtraction

matTwo <- matrix(0,2,3) # will  work, camel case
mat.Two <- matrix(0,2,3) # will work, but not good coding

# the values for the challenge matrix, laid out row by row
c(1,3,7,4,
  2,6,2,1,
  NA,0,0,2)
##  [1]  1  3  7  4  2  6  2  1 NA  0  0  2

# byrow = TRUE fills the matrix one row at a time instead of column by column
matChall <- matrix(c(1,3,7,4,2,6,2,1,NA,0,0,2), nrow = 3, ncol = 4, byrow = TRUE)
matChall
##      [,1] [,2] [,3] [,4]
## [1,]    1    3    7    4
## [2,]    2    6    2    1
## [3,]   NA    0    0    2

dim(matChall)
## [1] 3 4

Data frames

  • columns may hold different types of data, but every value within a column must be the same type
  • 2 dimensional, rows by columns
# Build a data frame with one integer, one character and one factor column.
# stringsAsFactors = FALSE keeps second_col as character.
datA <- data.frame("first_col" = 1:10,
                   "second_col" =  as.character(letters[1:10]),
                   "third_col" = factor(toupper(letters[1:10])),
                   stringsAsFactors = FALSE)

datA
##    first_col second_col third_col
## 1          1          a         A
## 2          2          b         B
## 3          3          c         C
## 4          4          d         D
## 5          5          e         E
## 6          6          f         F
## 7          7          g         G
## 8          8          h         H
## 9          9          i         I
## 10        10          j         J
str(datA)
## 'data.frame':    10 obs. of  3 variables:
##  $ first_col : int  1 2 3 4 5 6 7 8 9 10
##  $ second_col: chr  "a" "b" "c" "d" ...
##  $ third_col : Factor w/ 10 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9 10
names(datA) # find the column names
## [1] "first_col"  "second_col" "third_col"
# rename every column: one quoted name per column, in column order
names(datA) <- c("firstCol", "secondCol", "thirdCol")
datA
##    firstCol secondCol thirdCol
## 1         1         a        A
## 2         2         b        B
## 3         3         c        C
## 4         4         d        D
## 5         5         e        E
## 6         6         f        F
## 7         7         g        G
## 8         8         h        H
## 9         9         i        I
## 10       10         j        J
# rename only the first column, on purpose to a name that is not valid R syntax
names(datA)[1] <- c("#firstCol")
datA[,1] # all rows first column
##  [1]  1  2  3  4  5  6  7  8  9 10
datA[2,1] # second row, first column
## [1] 2
datA$`#firstCol` # notice the backticks: required because the name starts with #
##  [1]  1  2  3  4  5  6  7  8  9 10
datA$secondCol # a syntactically valid name needs no backticks
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
colnames(datA)
## [1] "#firstCol" "secondCol" "thirdCol"
rownames(datA)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
rownames(datA) <- letters[10:19] # set custom row names
rownames(datA) <- NULL # reset to the default 1, 2, 3, ... row names
str(datA)
## 'data.frame':    10 obs. of  3 variables:
##  $ #firstCol: int  1 2 3 4 5 6 7 8 9 10
##  $ secondCol: chr  "a" "b" "c" "d" ...
##  $ thirdCol : Factor w/ 10 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9 10
head(datA) # first six rows
##   #firstCol secondCol thirdCol
## 1         1         a        A
## 2         2         b        B
## 3         3         c        C
## 4         4         d        D
## 5         5         e        E
## 6         6         f        F
tail(datA) # last six rows
##    #firstCol secondCol thirdCol
## 5          5         e        E
## 6          6         f        F
## 7          7         g        G
## 8          8         h        H
## 9          9         i        I
## 10        10         j        J

Indexing

datA[,1] # first column all rows
##  [1]  1  2  3  4  5  6  7  8  9 10
datA[2,] # second row all columns
##   #firstCol secondCol thirdCol
## 2         2         b        B
datA$`#firstCol`[c(1,3,10)] # elements 1, 3 and 10 of one column
## [1]  1  3 10
datA[4, "thirdCol"] # row 4 of a column selected by name (a factor, so levels print)
## [1] D
## Levels: A B C D E F G H I J
datA[,-2] # negative index: everything except column 2
##    #firstCol thirdCol
## 1          1        A
## 2          2        B
## 3          3        C
## 4          4        D
## 5          5        E
## 6          6        F
## 7          7        G
## 8          8        H
## 9          9        I
## 10        10        J
datA[,-c(2,3)] # only one column is left, so the result is a plain vector
##  [1]  1  2  3  4  5  6  7  8  9 10
datA[c(1,1,2,2,3,3),] # repeating an index repeats the row
##     #firstCol secondCol thirdCol
## 1           1         a        A
## 1.1         1         a        A
## 2           2         b        B
## 2.1         2         b        B
## 3           3         c        C
## 3.1         3         c        C
datA[rep(1:3,2),] # rows 1 to 3, then rows 1 to 3 again
##     #firstCol secondCol thirdCol
## 1           1         a        A
## 2           2         b        B
## 3           3         c        C
## 1.1         1         a        A
## 2.1         2         b        B
## 3.1         3         c        C
datA[rep(1:3,each = 2),] # each of rows 1 to 3 twice in a row
##     #firstCol secondCol thirdCol
## 1           1         a        A
## 1.1         1         a        A
## 2           2         b        B
## 2.1         2         b        B
## 3           3         c        C
## 3.1         3         c        C
names(datA); colnames(datA) # the same for a data frame
## [1] "#firstCol" "secondCol" "thirdCol"
## [1] "#firstCol" "secondCol" "thirdCol"
names(datA) == "secondCol" # logical test against every column name
## [1] FALSE  TRUE FALSE
which(names(datA) == "secondCol") # position of the matching column
## [1] 2
datA[which(names(datA) == "secondCol")] # keep only the matching column
##    secondCol
## 1          a
## 2          b
## 3          c
## 4          d
## 5          e
## 6          f
## 7          g
## 8          h
## 9          i
## 10         j
# caution: if no name matches, which() is empty and -which() drops every
# column; the != form on the next statement does not have that problem
datA[-which(names(datA) == "secondCol")] # use minus to drop column
##    #firstCol thirdCol
## 1          1        A
## 2          2        B
## 3          3        C
## 4          4        D
## 5          5        E
## 6          6        F
## 7          7        G
## 8          8        H
## 9          9        I
## 10        10        J
datA[which(names(datA) != "secondCol")]
##    #firstCol thirdCol
## 1          1        A
## 2          2        B
## 3          3        C
## 4          4        D
## 5          5        E
## 6          6        F
## 7          7        G
## 8          8        H
## 9          9        I
## 10        10        J
datA[,which(names(datA) == "#firstCol" | names(datA) == "thirdCol")]
##    #firstCol thirdCol
## 1          1        A
## 2          2        B
## 3          3        C
## 4          4        D
## 5          5        E
## 6          6        F
## 7          7        G
## 8          8        H
## 9          9        I
## 10        10        J
datA[,which(names(datA) %in% c("#firstCol","thirdCol"))] # %in% tests set membership
##    #firstCol thirdCol
## 1          1        A
## 2          2        B
## 3          3        C
## 4          4        D
## 5          5        E
## 6          6        F
## 7          7        G
## 8          8        H
## 9          9        I
## 10        10        J
# == compares element by element and recycles the shorter vector, so it is
# the wrong tool for matching against a set of names; use %in% as above
names(datA) == c("#firstCol","thirdCol")
## Warning in names(datA) == c("#firstCol", "thirdCol"): longer object length
## is not a multiple of shorter object length
## [1]  TRUE FALSE FALSE

Lists

  • flexible data structure
  • often used to store data
  • used with functional programming tools such as purrr
# Start from an empty list and add elements of different kinds one at a time.
C <- list()
C
## list()
C[[1]]  <- c("a","b","c") # element 1: a character vector, added by position
C
## [[1]]
## [1] "a" "b" "c"
C[[2]] <- datA # element 2: the data frame built earlier

# elements can also be added by name: store a fitted linear model as "mod1"
C[["mod1"]] <- lm(mpg ~ hp, data = mtcars)

Getting to know R: a very brief introduction

August 24, 2018
Beginning R R Studio Sampling Data Management and Visualization
comments powered by Disqus