DISCLAIMER: The images, code snippets, etc. presented in this presentation were collected, copied, and borrowed from various internet sources; thanks and credit to the creators/owners.
"The ability to take data—to be able to understand it, to process it, to extract value from it, to visualize it, to communicate it—that's going to be a hugely important skill in the next decades." - Hal Varian
This is actually just the tip of the iceberg.
Statistics
Mean, Median, Mode, Standard Deviation, Range, Quartiles, Skewness, Kurtosis, and more
# Applying basic statistics
data(iris)                   # load the built-in iris dataset
class(iris)                  # "data.frame"
mean(iris$Sepal.Length)
sd(iris$Sepal.Length)        # standard deviation
var(iris$Sepal.Length)       # variance
min(iris$Sepal.Length)
max(iris$Sepal.Length)
median(iris$Sepal.Length)
range(iris$Sepal.Length)
quantile(iris$Sepal.Length)  # quartiles by default
sapply(iris[1:4], mean, na.rm = TRUE)   # mean of each numeric column
summary(iris)                # overview of every column
cor(iris[, 1:4])             # correlation matrix
cov(iris[, 1:4])             # covariance matrix
# two-sample t-test: petal width of setosa vs. versicolor
t.test(iris$Petal.Width[iris$Species == "setosa"],
       iris$Petal.Width[iris$Species == "versicolor"])
# test whether sepal length and sepal width are correlated
cor.test(iris$Sepal.Length, iris$Sepal.Width)
# per-species means of the four numeric columns
aggregate(x = iris[, 1:4], by = list(iris$Species), FUN = mean)
# the same kind of summary via the (older) reshape package
library(reshape)
iris.melt <- melt(iris, id = 'Species')
cast(Species ~ variable, data = iris.melt, mean,
     subset = Species %in% c('setosa', 'versicolor'),
     margins = 'grand_row')
?reshape
?aggregate
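The list above also mentions mode, skewness, and kurtosis, which base R does not cover directly; a minimal sketch, assuming the e1071 package is installed (stat_mode is a hypothetical helper, not a base function):
# skewness and kurtosis come from the e1071 package
library(e1071)
skewness(iris$Sepal.Length)
kurtosis(iris$Sepal.Length)
# base R has no function for the statistical mode; a common idiom:
stat_mode <- function(v) names(which.max(table(v)))
stat_mode(iris$Petal.Width)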
Data Analysis & Visualizations
Summaries, Scatter plots, Histograms, Box plots, Pie charts, Bar plots, and more
data(iris)
table.iris = table(iris$Species)    # counts per species
table.iris
pie(table.iris)                               # pie chart
hist(iris$Sepal.Length)                       # histogram
boxplot(Petal.Width ~ Species, data = iris)   # box plot by species
plot(x = iris$Petal.Length, y = iris$Petal.Width, col = iris$Species)   # scatter plot
pairs(iris[1:4], main = "Edgar Anderson's Iris Data", pch = 21,
      bg = c("red", "green3", "blue")[unclass(iris$Species)])           # scatter-plot matrix
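The list above also mentions bar plots; the species counts from table() work there too:
# a bar plot of the species counts
barplot(table.iris)
The sales data frame used below is not part of base R; a minimal sketch of loading it, where the file name is an assumption:
# hypothetical file name - point this at your copy of the data
sales <- read.csv("yearly_sales.csv", stringsAsFactors = TRUE)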
head(sales)
cust_id sales_total num_of_orders gender
1 100001 800.64 3 F
2 100002 217.53 3 F
3 100003 74.58 2 M
4 100004 498.60 3 M
5 100005 723.11 4 F
6 100006 69.43 2 F
summary(sales)
cust_id sales_total num_of_orders gender
Min. :100001 Min. : 30.02 Min. : 1.000 F:5035
1st Qu.:102501 1st Qu.: 80.29 1st Qu.: 2.000 M:4965
Median :105001 Median : 151.65 Median : 2.000
Mean :105001 Mean : 249.46 Mean : 2.428
3rd Qu.:107500 3rd Qu.: 295.50 3rd Qu.: 3.000
Max. :110000 Max. :7606.09 Max. :22.000
# console and script
> x = 7
> x + 9
[1] 16
# comments
# comments are super important: they explain what your code does and why
# graphics
x = rnorm(1000, mean = 100, sd = 3)   # 1000 draws from a normal distribution
hist(x)
# getting help
# if you know the function name, but not how to use it:
?chisq.test
# if you know what you want to do, but don't know the function name:
??chisquare
# data types
# character vector
> y = c("apple", "apple", "banana", "kiwi", "pear", "strawberry", "strawberry")
> length(y)
[1] 7
# numeric vector
> numbers = rep(3, 99)
> numbers
[1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[39] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[77] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# matrices
> mymatrix = matrix(c(10, 15, 3, 29), nrow = 2, byrow = TRUE)
> mymatrix
[,1] [,2]
[1,] 10 15
[2,] 3 29
> t(mymatrix)
[,1] [,2]
[1,] 10 3
[2,] 15 29
> solve(mymatrix)
[,1] [,2]
[1,] 0.1183673 -0.06122449
[2,] -0.0122449 0.04081633
> mymatrix %*% solve(mymatrix)
[,1] [,2]
[1,] 1 0
[2,] 0 1
> chisq.test(mymatrix)
Pearson's Chi-squared test with Yates' continuity correction
data: mymatrix
X-squared = 5.8385, df = 1, p-value = 0.01568
# data frames (the mother of all R data types)
# set working directory
setwd("~/Documents/R_intro")
# read in a dataset
wages = read.table("wages.csv", sep = ",", header = TRUE)
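Data frames can also be built directly from vectors, without reading a file; a quick sketch:
# a data frame is a list of equal-length columns
df <- data.frame(name = c("ann", "bob"), wage = c(11.50, 9.25))
str(df)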
# exploratory data analysis
> names(wages)
[1] "edlevel" "south" "sex" "workyr" "union" "wage" "age"
[8] "race" "marital"
> class(wages$marital)
[1] "integer"
> table(wages$union)
not union member union member
438 96
> summary(wages$workyr)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 8.00 15.00 17.82 26.00 55.00
> nrow(wages)
[1] 534
> length(which(is.na(wages$sex)))
[1] 0
> linmod = lm(workyr ~ age, data = wages)
> summary(linmod)
hist(wages$wage, xlab = "hourly wage", main = "wages in our dataset", col = "purple")
plot(wages$age, wages$workyr, xlab = "age", ylab="years worked", main = "age vs. years worked")
abline(lm(wages$workyr ~ wages$age), col="red", lwd = 2)
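The column names from names(wages) above invite a few more quick plots; for example:
# hourly wage by union membership
boxplot(wage ~ union, data = wages, ylab = "hourly wage")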
# rows with mpg >= 20, keeping only the gear and mpg columns
mtcars[mtcars$mpg >= 20, c('gear', 'mpg')]
# mean mpg per gear among those rows
aggregate(. ~ gear, mtcars[mtcars$mpg >= 20, c('gear', 'mpg')], mean)
# keep only the groups whose mean mpg exceeds 25
subset(aggregate(. ~ gear, mtcars[mtcars$mpg >= 20, c('gear', 'mpg')], mean), mpg > 25)
# output
  gear   mpg
2    4 25.74
3    5 28.20
# fit a simple linear model and predict weight for new ages
myData <- data.frame(age = c(10, 20, 30), weight = c(100, 200, 300))
myPersonsAge <- data.frame(age = c(20, 25, 30))
myTrainModel <- lm(weight ~ age, data = myData)
predict(myTrainModel, myPersonsAge)
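The three training points lie exactly on the line weight = 10 * age, so the fit is exact and predict() returns 200, 250, and 300 for ages 20, 25, and 30.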
Machine Learning Intro
Machine Learning Algorithms
# Load required packages
library(ggplot2)
library(datasets)
# Load data
data(iris)
# Set seed to make results reproducible
set.seed(20)
# Implement k-means with 3 clusters
iris_cl <- kmeans(iris[, 3:4], 3, nstart = 20)
iris_cl$cluster <- as.factor(iris_cl$cluster)
# Plot points colored by predicted cluster
ggplot(iris, aes(Petal.Length, Petal.Width, color = iris_cl$cluster)) + geom_point()
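A standard follow-up (not in the original snippet) is to cross-tabulate the cluster assignments against the true species:
# how well do the 3 clusters match the 3 species?
table(iris_cl$cluster, iris$Species)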
# Include required packages
library(party)
library(partykit)
# Have a look at the first few observations of the dataset
print(head(readingSkills))
# Use the first 105 rows for training
input.dat <- readingSkills[1:105, ]
# Grow the decision tree
output.tree <- ctree(
nativeSpeaker ~ age + shoeSize + score,
data = input.dat)
# Plot the results
plot(as.simpleparty(output.tree))
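Rows 106 to 200 were held out above; a minimal sketch of evaluating the tree on them:
# predict the class of the held-out rows and tabulate against the truth
test.dat <- readingSkills[106:200, ]
pred <- predict(output.tree, newdata = test.dat)
table(pred, test.dat$nativeSpeaker)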
# Load required packages
library(ggplot2)
# Load iris dataset
data(iris)
# Have a look at the first few observations of the dataset
head(iris)
# Fit the regression line
fitted_model <- lm(Sepal.Length ~ Petal.Width + Petal.Length, data = iris)
# Get details about the parameters of the selected model
summary(fitted_model)
# Plot the data points along with the regression line
ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
geom_point(alpha = 6/10) +
stat_smooth(method = "lm", fill="blue", colour="grey50", size=0.5, alpha = 0.1)
# svm
library(e1071)
# quick look at the data
plot(iris)
# eyeball which feature pairs separate the species best
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
# split the data: 100 random rows for training, the remaining 50 for testing
s <- sample(150, 100)
col <- c('Petal.Length', 'Petal.Width', 'Species')
iris_train <- iris[s, col]
iris_test <- iris[-s, col]
# create model
svmfit <- svm(Species ~ ., data = iris_train, kernel = "linear", cost = 0.1, scale = FALSE)
print(svmfit)
plot(svmfit, iris_train[, col])
tuned <- tune(svm, Species ~ ., data = iris_train, kernel = 'linear',
              ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10, 100)))
summary(tuned)
p <- predict(svmfit, iris_test[, col], type = 'class')
plot(p)
table(p, iris_test$Species)    # confusion matrix
mean(p == iris_test$Species)   # accuracy
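tune() keeps the winning model, so it can be used directly instead of refitting by hand:
# predict with the best model found during tuning
best_model <- tuned$best.model
p2 <- predict(best_model, iris_test[, col])
mean(p2 == iris_test$Species)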
Simple Linear Regression
# Creating Statistical Models
# Load the data
data(iris)
# Peek at the data
head(iris)
# Create a scatterplot
plot(
x = iris$Petal.Length,
y = iris$Petal.Width,
main = "Iris Petal Length vs. Width",
xlab = "Petal Length (cm)",
ylab = "Petal Width (cm)")
# Create a linear regression model
model <- lm(
formula = Petal.Width ~ Petal.Length,
data = iris)
# Summarize the model
summary(model)
# Draw a regression line on plot
lines(
x = iris$Petal.Length,
y = model$fitted,
col = "red",
lwd = 3)
# Get correlation coefficient
cor(
x = iris$Petal.Length,
y = iris$Petal.Width)
# Predict new values from the model
predict(
object = model,
newdata = data.frame(
Petal.Length = c(2, 5, 7)))
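For reference, the fitted line on iris is roughly Petal.Width = -0.36 + 0.42 * Petal.Length, so these predictions come out near 0.47, 1.72, and 2.55 (approximate values; run the code to verify).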
Some important algorithms: https://goo.gl/zAyFea
Least squares regression line
# generate normally distributed data for linear regression, make the scatter plot, and draw
# the least squares regression line.
# generate 100 normally distributed random numbers and plot a histogram.
x <- rnorm(100)
hist(x)
y <- x + rnorm(100)   # y is x plus unit-variance noise: true slope 1, intercept 0
plot(x, y)
foo <- lm(y ~ x)
abline(coefficients(foo))
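A quick sanity check: since y was built as x plus noise, the true intercept is 0 and the true slope is 1.
summary(foo)   # the estimated slope should be close to 1, the intercept close to 0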