Data Analysis with R: Introduction of R Language

Blogs

The Workspace

The workspace is your current R working environment and includes any user-defined objects (vectors, matrices, data frames, lists, functions). At the end of an R session, the user can save an image of the current workspace that is automatically reloaded the next time R is started.

getwd()
## [1] "/media/myaseen/Documents/MYaseen208/Consultancy_at_UAF/Mr._M._Yaseen/2018-04-20_Website/MYaseen208/content/myaseen208"
#setwd("C:/Users/PBS FAISALABAD/Documents/R/Training")#use forward slash instead of back slash in windows.
save.image() # To save workspace image.
# Create a small integer vector to demonstrate saving and restoring an object.
x <- 1:4
x
## [1] 1 2 3 4
# save() writes only the named objects (here `x`) to the given file.
save(x, file = "mydat.RData")
# Remove just `x`; rm(list = ls()) would delete every object in the workspace.
rm(x)
# load() reads the objects stored in the file back into the workspace.
load("mydat.RData")
x
## [1] 1 2 3 4

Data types

x<-2.56
class(x)
## [1] "numeric"
x
## [1] 2.56
y<-as.integer(x)
class(y)
## [1] "integer"
y
## [1] 2
z<-as.integer(2.56)
z
## [1] 2
is.integer(z)
## [1] TRUE
as.integer(is.integer(z))
## [1] 1
as.integer(!is.integer(z))
## [1] 0
as.integer(is.integer(x))
## [1] 0
x<-round(x,digits = 1)
x
## [1] 2.6
y<-"Male"
class(y)
## [1] "character"

Vectors

A vector is a sequence of data elements of the same basic type. The combine function c() is used to form a vector. Here are examples of each type of vector:

x<-c(1,2,3,4)
x
## [1] 1 2 3 4
y<-c("a","b","c","d")
y
## [1] "a" "b" "c" "d"
c(x,y)
## [1] "1" "2" "3" "4" "a" "b" "c" "d"
length(y)
## [1] 4
y[c(2,4)]
## [1] "b" "d"
y[c(2:4)]
## [1] "b" "c" "d"
y[-3]
## [1] "a" "b" "d"
names(x)<-c("first","second","third","fourth")
x
##  first second  third fourth 
##      1      2      3      4
x[c("second","fourth")]
## second fourth 
##      2      4
x[2]<-5
x
##  first second  third fourth 
##      1      5      3      4
x<-x*2
x
##  first second  third fourth 
##      2     10      6      8
seq(from = 1, to = 10, by = 2)
## [1] 1 3 5 7 9
# Repeat the value 5 ten times; the argument name `times` is spelled out in full.
rep(5, times = 10)
##  [1] 5 5 5 5 5 5 5 5 5 5
# Repeat the whole vector ten times (Male, Female, Male, Female, ...).
rep(c("Male", "Female"), times = 10)
##  [1] "Male"   "Female" "Male"   "Female" "Male"   "Female" "Male"  
##  [8] "Female" "Male"   "Female" "Male"   "Female" "Male"   "Female"
## [15] "Male"   "Female" "Male"   "Female" "Male"   "Female"
rep(c("Male","Female"), each = 10)
##  [1] "Male"   "Male"   "Male"   "Male"   "Male"   "Male"   "Male"  
##  [8] "Male"   "Male"   "Male"   "Female" "Female" "Female" "Female"
## [15] "Female" "Female" "Female" "Female" "Female" "Female"
# Repeat the sequence 2, 2.25, ..., 9 five times.
rep(seq(from = 2, to = 9, by = 0.25), times = 5)
##   [1] 2.00 2.25 2.50 2.75 3.00 3.25 3.50 3.75 4.00 4.25 4.50 4.75 5.00 5.25
##  [15] 5.50 5.75 6.00 6.25 6.50 6.75 7.00 7.25 7.50 7.75 8.00 8.25 8.50 8.75
##  [29] 9.00 2.00 2.25 2.50 2.75 3.00 3.25 3.50 3.75 4.00 4.25 4.50 4.75 5.00
##  [43] 5.25 5.50 5.75 6.00 6.25 6.50 6.75 7.00 7.25 7.50 7.75 8.00 8.25 8.50
##  [57] 8.75 9.00 2.00 2.25 2.50 2.75 3.00 3.25 3.50 3.75 4.00 4.25 4.50 4.75
##  [71] 5.00 5.25 5.50 5.75 6.00 6.25 6.50 6.75 7.00 7.25 7.50 7.75 8.00 8.25
##  [85] 8.50 8.75 9.00 2.00 2.25 2.50 2.75 3.00 3.25 3.50 3.75 4.00 4.25 4.50
##  [99] 4.75 5.00 5.25 5.50 5.75 6.00 6.25 6.50 6.75 7.00 7.25 7.50 7.75 8.00
## [113] 8.25 8.50 8.75 9.00 2.00 2.25 2.50 2.75 3.00 3.25 3.50 3.75 4.00 4.25
## [127] 4.50 4.75 5.00 5.25 5.50 5.75 6.00 6.25 6.50 6.75 7.00 7.25 7.50 7.75
## [141] 8.00 8.25 8.50 8.75 9.00

Matrices

A matrix is a collection of data elements arranged in a two-dimensional rectangular layout. Matrices are created with the matrix() function.

# A: 2 x 2 matrix whose values are laid out along the rows.
A <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = TRUE)
# B: 3 x 3 matrix whose values are laid out down the columns
# (column-wise filling is matrix()'s default, so byrow is omitted).
B <- matrix(as.numeric(1:9), nrow = 3, ncol = 3)
B
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
A
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
A[2,]
## [1] 3 4
A[,2]
## [1] 2 4
A[2,2]
## [1] 4
t(A)
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
det(A)
## [1] -2
diag(A)
## [1] 1 4
B<-A + A
B
##      [,1] [,2]
## [1,]    2    4
## [2,]    6    8
Asquared<- A %*% A
Asquared
##      [,1] [,2]
## [1,]    7   10
## [2,]   15   22
Ainv<- solve(A)
Ainv
##      [,1] [,2]
## [1,] -2.0  1.0
## [2,]  1.5 -0.5
Aev<- eigen(A)
Aev
## eigen() decomposition
## $values
## [1]  5.3722813 -0.3722813
## 
## $vectors
##            [,1]       [,2]
## [1,] -0.4159736 -0.8245648
## [2,] -0.9093767  0.5657675
Aev$values
## [1]  5.3722813 -0.3722813
Aev$vectors
##            [,1]       [,2]
## [1,] -0.4159736 -0.8245648
## [2,] -0.9093767  0.5657675

Lists

Lists are the most complex of the R data types. Basically, a list is an ordered collection of objects (components). A list allows you to gather a variety of (possibly unrelated) objects under one name. For example, a list may contain a combination of vectors, matrices, data frames, and even other lists. A list is created with the list() function.

# Four unrelated objects: a string, a numeric vector, a matrix and a
# character vector.
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow = 5, ncol = 2)
k <- c("one", "two", "three")
# Gather them under one name; only the first two components are named,
# the last two are reachable by position.
mylist <- list(
  title = g,
  ages = h,
  j,
  k
)
mylist
## $title
## [1] "My First List"
## 
## $ages
## [1] 25 26 18 39
## 
## [[3]]
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
## 
## [[4]]
## [1] "one"   "two"   "three"

Data Frames

A data frame is used for storing data tables. It is a list of vectors of equal length. Different columns can contain different types of data (numeric, character, etc.). A data frame is created with the data.frame() function:

df<-data.frame(x,y)
df
##         x y
## first   2 a
## second 10 b
## third   6 c
## fourth  8 d
nrow(df)
## [1] 4
ncol(df)
## [1] 2
df1<-df[df$y %in% c("a","d"),]
df1
##        x y
## first  2 a
## fourth 8 d
df1<-df[df[,2] %in% c("b","c"),]
df1
##         x y
## second 10 b
## third   6 c
library(PakPC2017)
data(PakPC2017Tehsil)
head(PakPC2017Tehsil)
## # A tibble: 6 x 6
##   Province Division   District   Tehsil            Pop1998 Pop2017
##   <chr>    <chr>      <chr>      <chr>               <dbl>   <dbl>
## 1 Punjab   Bahawalpur Bahawalpur Hasilpur           317513  456006
## 2 Punjab   Bahawalpur Bahawalpur Khairpur Tamewali  183903  262628
## 3 Punjab   Bahawalpur Bahawalpur Yazman             168950  262175
## 4 Punjab   Bahawalpur Bahawalpur Ahmadpur East      718297 1078683
## 5 Punjab   Bahawalpur Bahawalpur Bahawalpur City    419542  681696
## 6 Punjab   Bahawalpur Bahawalpur Bahawalpur Saddar  387038  574950

Reading/writing data from/to files (Import/Export Data)

library(readxl)
spi <- read_excel("C:/Users/PBS FAISALABAD/Desktop/spi.xls")
DT::datatable(spi,fillContainer = TRUE, editable = TRUE)
write.csv(PakPC2017Tehsil,"Population.csv")

Connecting to the database

library(dplyr)
# Open an ODBC connection through DBI using the data source name "mydata".
con <- DBI::dbConnect(
  odbc::odbc(),
  dsn = "mydata"
)
# Reference the "RPI" table on that connection and print it.
table13 <- tbl(con, "RPI")
table13
# Optional (left disabled): copy the tehsil data into the database as a
# non-temporary table named "tehsil".
# tehsil <- PakPC2017::PakPC2017Tehsil
# copy_to(con, tehsil, "tehsil", temporary = FALSE)
# Close the connection once finished.
DBI::dbDisconnect(con)

Numerical or Graphical Summaries of data.

library(PakPC2017)
summary(PakPC2017Tehsil)
##    Province           Division           District        
##  Length:543         Length:543         Length:543        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     Tehsil             Pop1998           Pop2017       
##  Length:543         Min.   :   3024   Min.   :   2665  
##  Class :character   1st Qu.:  57502   1st Qu.:  89801  
##  Mode  :character   Median : 157142   Median : 254647  
##                     Mean   : 240553   Mean   : 377084  
##                     3rd Qu.: 298074   3rd Qu.: 447854  
##                     Max.   :2219399   Max.   :4269079  
##                                       NA's   :1
pairs(PakPC2017Tehsil[,5:6])

# Correlation between the two population columns, using only rows without
# missing values. The `use` value is spelled out in full rather than relying
# on partial matching of "complete".
cor(PakPC2017Tehsil[, 5:6], use = "complete.obs")
##           Pop1998   Pop2017
## Pop1998 1.0000000 0.9788354
## Pop2017 0.9788354 1.0000000
popLm<-lm(formula = Pop2017 ~ Pop1998, data = PakPC2017Tehsil)
popLm
## 
## Call:
## lm(formula = Pop2017 ~ Pop1998, data = PakPC2017Tehsil)
## 
## Coefficients:
## (Intercept)      Pop1998  
##   -3445.895        1.579
head(resid(popLm),10)
##          1          2          3          4          5          6 
## -42044.030 -24391.708  -1227.192 -52385.341  22496.260 -32911.295 
##          7          8          9         10 
## -36767.021 -92325.684 -73938.014  10353.258
head(fitted(popLm),10)
##         1         2         3         4         5         6         7 
##  498050.0  287019.7  263402.2 1131068.3  659199.7  607861.3  851910.0 
##         8         9        10 
##  783546.7  599536.0  143090.7
summary(popLm)
## 
## Call:
## lm(formula = Pop2017 ~ Pop1998, data = PakPC2017Tehsil)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -299426  -29128    -355   18117 1071215 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.446e+03  5.391e+03  -0.639    0.523    
## Pop1998      1.579e+00  1.421e-02 111.147   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96960 on 540 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9581, Adjusted R-squared:  0.958 
## F-statistic: 1.235e+04 on 1 and 540 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(popLm)

Summarize Data

library(PakPC2017)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Bar charts and Dot charts

library(PakPC2017)
data("PakPC2017Tehsil")
head(PakPC2017Tehsil)
## # A tibble: 6 x 6
##   Province Division   District   Tehsil            Pop1998 Pop2017
##   <chr>    <chr>      <chr>      <chr>               <dbl>   <dbl>
## 1 Punjab   Bahawalpur Bahawalpur Hasilpur           317513  456006
## 2 Punjab   Bahawalpur Bahawalpur Khairpur Tamewali  183903  262628
## 3 Punjab   Bahawalpur Bahawalpur Yazman             168950  262175
## 4 Punjab   Bahawalpur Bahawalpur Ahmadpur East      718297 1078683
## 5 Punjab   Bahawalpur Bahawalpur Bahawalpur City    419542  681696
## 6 Punjab   Bahawalpur Bahawalpur Bahawalpur Saddar  387038  574950
province<-tapply(PakPC2017Tehsil$Pop2017,PakPC2017Tehsil$Province,sum,na.rm=TRUE)
par(mfrow = c(3,2))
dotchart(province)
## Warning in dotchart(province): 'x' is neither a vector nor a matrix: using
## as.numeric(x)
barplot(province)
boxplot(Pop2017~Province,data = PakPC2017Tehsil)
pie(province, col = rainbow(length(province)))
plot(province, type = "o")
pop<-PakPC2017Tehsil$Pop2017
hist(pop)

hist(pop, breaks = seq(min(pop,na.rm = TRUE),max(pop,na.rm = TRUE), length.out = 31))

ggplot2 package

The mpg Data Frame

A data frame is a rectangular collection of variables (in the columns) and observations (in the rows). mpg contains observations collected by the US Environmental Protection Agency on 38 models of cars. To learn more about mpg, open its help page by running ?mpg.

library(ggplot2)
ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy, color = class))

ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))

ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

ggplot(data = mpg)+
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg)+
  geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy, color = drv))+
  geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv))+
  geom_point()+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy))+
  geom_point(mapping = aes(color = drv))+
  geom_smooth(mapping = aes(linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Data Transformation

To explore basic data manipulation, we’ll use PakPC2017::PakPC2017Tehsil. This data frame contains the population of all 543 tehsils of Pakistan. The data come from the Pakistan Bureau of Statistics, Population Census 2017, and are documented in ?PakPC2017Tehsil.

library(PakPC2017)
library(dplyr)
tehsil<-as_tibble(PakPC2017Tehsil)
tehsil
## # A tibble: 543 x 6
##    Province Division   District     Tehsil            Pop1998 Pop2017
##    <chr>    <chr>      <chr>        <chr>               <dbl>   <dbl>
##  1 Punjab   Bahawalpur Bahawalpur   Hasilpur           317513  456006
##  2 Punjab   Bahawalpur Bahawalpur   Khairpur Tamewali  183903  262628
##  3 Punjab   Bahawalpur Bahawalpur   Yazman             168950  262175
##  4 Punjab   Bahawalpur Bahawalpur   Ahmadpur East      718297 1078683
##  5 Punjab   Bahawalpur Bahawalpur   Bahawalpur City    419542  681696
##  6 Punjab   Bahawalpur Bahawalpur   Bahawalpur Saddar  387038  574950
##  7 Punjab   Bahawalpur Bahawalnagar Bahawalnagar       541553  815143
##  8 Punjab   Bahawalpur Bahawalnagar Chishtian          498270  691221
##  9 Punjab   Bahawalpur Bahawalnagar Haroonabad         381767  525598
## 10 Punjab   Bahawalpur Bahawalnagar Fort Abbas          92777  153444
## # ... with 533 more rows

Filter Rows with filter()

filter() allows you to subset observations based on their values. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame. For example, we can select all tehsils of Division Faisalabad:

filter(tehsil, Division == "Faisalabad")
## # A tibble: 17 x 6
##    Province Division   District       Tehsil            Pop1998 Pop2017
##    <chr>    <chr>      <chr>          <chr>               <dbl>   <dbl>
##  1 Punjab   Faisalabad Faisalabad     Faisalabad City   2140346 3237961
##  2 Punjab   Faisalabad Faisalabad     Faisalabad Saddar  924110 1465411
##  3 Punjab   Faisalabad Faisalabad     Chak Jhumra        253806  332461
##  4 Punjab   Faisalabad Faisalabad     Sammundri          508637  643068
##  5 Punjab   Faisalabad Faisalabad     Tandlianwala       540802  702733
##  6 Punjab   Faisalabad Faisalabad     Jaranwala         1061846 1492276
##  7 Punjab   Faisalabad Jhang          Jhang              999130 1465472
##  8 Punjab   Faisalabad Jhang          18 Hazari          200036  295801
##  9 Punjab   Faisalabad Jhang          Shorkot            373790  548626
## 10 Punjab   Faisalabad Jhang          Ahmadpur Sial      296465  433517
## 11 Punjab   Faisalabad Chiniot        Chiniot            382876  556147
## 12 Punjab   Faisalabad Chiniot        Lalian             309494  439323
## 13 Punjab   Faisalabad Chiniot        Bhawana            272754  374270
## 14 Punjab   Faisalabad Toba Tek Singh Toba Tek Singh     554231  739826
## 15 Punjab   Faisalabad Toba Tek Singh Gojra              495096  656007
## 16 Punjab   Faisalabad Toba Tek Singh Kamalia            283674  371851
## 17 Punjab   Faisalabad Toba Tek Singh Pir Mahal          288592  422331
# Keep the tehsils whose Division name contains "F" and "abad" respectively.
# The column is referenced by its bare name: filter() evaluates its conditions
# inside `tehsil`, so repeating `tehsil$` is unnecessary and would ignore any
# grouping applied to the data.
newD <- filter(tehsil, grepl("F", Division))
newd2 <- filter(tehsil, grepl("abad", Division))

If you want to save the result, you’ll need to use the assignment operator, <-: fsdDivision <- filter(tehsil, Division == "Faisalabad")

Comparisons

To use filtering effectively, you have to know how to select the observations that you want using the comparison operators. R provides the standard suite: >, >=, <, <=, != (not equal), and == (equal).

filter(tehsil, Pop2017 >= 1000000)
## # A tibble: 35 x 6
##    Province Division   District       Tehsil            Pop1998 Pop2017
##    <chr>    <chr>      <chr>          <chr>               <dbl>   <dbl>
##  1 Punjab   Bahawalpur Bahawalpur     Ahmadpur East      718297 1078683
##  2 Punjab   Bahawalpur Rahim Yar Khan Rahim Yar Khan     986614 1530330
##  3 Punjab   Bahawalpur Rahim Yar Khan Sadiqabad          771511 1264752
##  4 Punjab   Faisalabad Faisalabad     Faisalabad City   2140346 3237961
##  5 Punjab   Faisalabad Faisalabad     Faisalabad Saddar  924110 1465411
##  6 Punjab   Faisalabad Faisalabad     Jaranwala         1061846 1492276
##  7 Punjab   Faisalabad Jhang          Jhang              999130 1465472
##  8 Punjab   Sahiwal    Sahiwal        Sahiwal           1056132 1491553
##  9 Punjab   Sahiwal    Sahiwal        Chichawatni        787062 1026007
## 10 Punjab   Sahiwal    Okara          Okara              862364 1205655
## # ... with 25 more rows

Logical Operators

Multiple arguments to filter() are combined with “and”: every expression must be true in order for a row to be included in the output. For other types of combinations, you’ll need to use Boolean operators yourself: & is “and”, | is “or”, and ! is “not”.

filter(tehsil, Pop2017 >= 2000000 | Pop1998 >= 1000000)
## # A tibble: 15 x 6
##    Province           Division   District    Tehsil        Pop1998 Pop2017
##    <chr>              <chr>      <chr>       <chr>           <dbl>   <dbl>
##  1 Punjab             Faisalabad Faisalabad  Faisalabad C… 2140346 3237961
##  2 Punjab             Faisalabad Faisalabad  Jaranwala     1061846 1492276
##  3 Punjab             Sahiwal    Sahiwal     Sahiwal       1056132 1491553
##  4 Punjab             Sahiwal    Okara       Depalpur      1030836 1374912
##  5 Punjab             Multan     Multan      Multan City   1381478 2258570
##  6 Punjab             Sargodha   Sargodha    Sargodha      1081459 1537866
##  7 Punjab             Lahore     Lahore      Lahore City   2219399 3655774
##  8 Punjab             Lahore     Lahore      Model Town    1409228 2698235
##  9 Punjab             Lahore     Lahore      Shalimar      1434823 2280308
## 10 Punjab             Lahore     Sheikhupura Sheikhupura   1049264 1555424
## 11 Punjab             Gujranwala Gujranwala  Gujranwal Sa… 1812137 2807054
## 12 Punjab             Gujranwala Gujrat      Gujrat        1093088 1497865
## 13 Punjab             Gujranwala Sialkot     Sialkot       1207744 1794658
## 14 Punjab             Rawalpindi Rawalpindi  Rawalpindi    1927612 3258547
## 15 Khyber Pakhtunkhwa Peshawar   Peshawar    Peshawar      2026851 4269079
filter(tehsil, Pop2017 >= 1000000 & Pop1998 >= 1000000)
## # A tibble: 15 x 6
##    Province           Division   District    Tehsil        Pop1998 Pop2017
##    <chr>              <chr>      <chr>       <chr>           <dbl>   <dbl>
##  1 Punjab             Faisalabad Faisalabad  Faisalabad C… 2140346 3237961
##  2 Punjab             Faisalabad Faisalabad  Jaranwala     1061846 1492276
##  3 Punjab             Sahiwal    Sahiwal     Sahiwal       1056132 1491553
##  4 Punjab             Sahiwal    Okara       Depalpur      1030836 1374912
##  5 Punjab             Multan     Multan      Multan City   1381478 2258570
##  6 Punjab             Sargodha   Sargodha    Sargodha      1081459 1537866
##  7 Punjab             Lahore     Lahore      Lahore City   2219399 3655774
##  8 Punjab             Lahore     Lahore      Model Town    1409228 2698235
##  9 Punjab             Lahore     Lahore      Shalimar      1434823 2280308
## 10 Punjab             Lahore     Sheikhupura Sheikhupura   1049264 1555424
## 11 Punjab             Gujranwala Gujranwala  Gujranwal Sa… 1812137 2807054
## 12 Punjab             Gujranwala Gujrat      Gujrat        1093088 1497865
## 13 Punjab             Gujranwala Sialkot     Sialkot       1207744 1794658
## 14 Punjab             Rawalpindi Rawalpindi  Rawalpindi    1927612 3258547
## 15 Khyber Pakhtunkhwa Peshawar   Peshawar    Peshawar      2026851 4269079
filter(tehsil, Pop2017 >= 2000000 & !Pop1998 >= 1000000)
## # A tibble: 0 x 6
## # ... with 6 variables: Province <chr>, Division <chr>, District <chr>,
## #   Tehsil <chr>, Pop1998 <dbl>, Pop2017 <dbl>

Missing Values

One important feature of R that can make comparison tricky is missing values, or NAs (“not available”). NA represents an unknown value, so missing values are “contagious”: almost any operation involving an unknown value will also be unknown. Use is.na() to determine whether a value is missing.

NA + 5
## [1] NA
NA/2
## [1] NA
NA*5
## [1] NA
x<-c(NA,5,10)
is.na(x)
## [1]  TRUE FALSE FALSE

Arrange Rows with arrange()

In the example above, the first value of x is NA. arrange() works similarly to filter() except that instead of selecting rows, it changes their order. It takes a data frame and a set of column names to order by.

arrange(tehsil, Pop2017)
## # A tibble: 543 x 6
##    Province    Division  District      Tehsil            Pop1998 Pop2017
##    <chr>       <chr>     <chr>         <chr>               <dbl>   <dbl>
##  1 Balochistan Sibi      Sibi          Sangan               3024    2665
##  2 Fata        Fata      Bajaur Agency Bar Chamer Kand      3247    2868
##  3 Balochistan Zhob      Zhob          Kashatoo             5165    5180
##  4 Balochistan Kalat     Kalat         Gazg                 3979    5726
##  5 Sindh       Karachi   Karachi West  Manora Cantonment   10008    5874
##  6 Balochistan Sibi      Sibi          Kotmandai            8057    7587
##  7 Balochistan Nasirabad Jhal Magsi    Mirpur              11562    9444
##  8 Balochistan Sibi      Harnai        Shahrig             17170    9454
##  9 Sindh       Hyderabad Sujawal       Kharo Chan Taluka    9956   10235
## 10 Balochistan Sibi      Dera Bugti    Sangsillah          13700   10265
## # ... with 533 more rows

Use desc() to reorder by a column in descending order.

Select Columns with select()

It’s not uncommon to get datasets with hundreds or even thousands of variables. In this case, the first challenge is often narrowing in on the variables you’re actually interested in. select() allows you to rapidly zoom in on a useful subset using operations based on the names of the variables.

select(tehsil,Province, Tehsil, Pop2017)
## # A tibble: 543 x 3
##    Province Tehsil            Pop2017
##    <chr>    <chr>               <dbl>
##  1 Punjab   Hasilpur           456006
##  2 Punjab   Khairpur Tamewali  262628
##  3 Punjab   Yazman             262175
##  4 Punjab   Ahmadpur East     1078683
##  5 Punjab   Bahawalpur City    681696
##  6 Punjab   Bahawalpur Saddar  574950
##  7 Punjab   Bahawalnagar       815143
##  8 Punjab   Chishtian          691221
##  9 Punjab   Haroonabad         525598
## 10 Punjab   Fort Abbas         153444
## # ... with 533 more rows
select(tehsil,-Province, -Tehsil, -Pop2017)
## # A tibble: 543 x 3
##    Division   District     Pop1998
##    <chr>      <chr>          <dbl>
##  1 Bahawalpur Bahawalpur    317513
##  2 Bahawalpur Bahawalpur    183903
##  3 Bahawalpur Bahawalpur    168950
##  4 Bahawalpur Bahawalpur    718297
##  5 Bahawalpur Bahawalpur    419542
##  6 Bahawalpur Bahawalpur    387038
##  7 Bahawalpur Bahawalnagar  541553
##  8 Bahawalpur Bahawalnagar  498270
##  9 Bahawalpur Bahawalnagar  381767
## 10 Bahawalpur Bahawalnagar   92777
## # ... with 533 more rows
select(tehsil,Pop2017, everything())
## # A tibble: 543 x 6
##    Pop2017 Province Division   District     Tehsil            Pop1998
##      <dbl> <chr>    <chr>      <chr>        <chr>               <dbl>
##  1  456006 Punjab   Bahawalpur Bahawalpur   Hasilpur           317513
##  2  262628 Punjab   Bahawalpur Bahawalpur   Khairpur Tamewali  183903
##  3  262175 Punjab   Bahawalpur Bahawalpur   Yazman             168950
##  4 1078683 Punjab   Bahawalpur Bahawalpur   Ahmadpur East      718297
##  5  681696 Punjab   Bahawalpur Bahawalpur   Bahawalpur City    419542
##  6  574950 Punjab   Bahawalpur Bahawalpur   Bahawalpur Saddar  387038
##  7  815143 Punjab   Bahawalpur Bahawalnagar Bahawalnagar       541553
##  8  691221 Punjab   Bahawalpur Bahawalnagar Chishtian          498270
##  9  525598 Punjab   Bahawalpur Bahawalnagar Haroonabad         381767
## 10  153444 Punjab   Bahawalpur Bahawalnagar Fort Abbas          92777
## # ... with 533 more rows

Add New Variables with mutate()

Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. That’s the job of mutate(). mutate() always adds new columns at the end of your dataset. When you’re in RStudio use View() to see all columns.

dplyr::mutate(tehsil, diff = Pop2017-Pop1998)
## # A tibble: 543 x 7
##    Province Division   District     Tehsil          Pop1998 Pop2017   diff
##    <chr>    <chr>      <chr>        <chr>             <dbl>   <dbl>  <dbl>
##  1 Punjab   Bahawalpur Bahawalpur   Hasilpur         317513  456006 138493
##  2 Punjab   Bahawalpur Bahawalpur   Khairpur Tamew…  183903  262628  78725
##  3 Punjab   Bahawalpur Bahawalpur   Yazman           168950  262175  93225
##  4 Punjab   Bahawalpur Bahawalpur   Ahmadpur East    718297 1078683 360386
##  5 Punjab   Bahawalpur Bahawalpur   Bahawalpur City  419542  681696 262154
##  6 Punjab   Bahawalpur Bahawalpur   Bahawalpur Sad…  387038  574950 187912
##  7 Punjab   Bahawalpur Bahawalnagar Bahawalnagar     541553  815143 273590
##  8 Punjab   Bahawalpur Bahawalnagar Chishtian        498270  691221 192951
##  9 Punjab   Bahawalpur Bahawalnagar Haroonabad       381767  525598 143831
## 10 Punjab   Bahawalpur Bahawalnagar Fort Abbas        92777  153444  60667
## # ... with 533 more rows
dplyr::mutate(tehsil, diff = Pop2017-Pop1998, add = Pop2017+Pop1998)
## # A tibble: 543 x 8
##    Province Division   District     Tehsil   Pop1998 Pop2017   diff    add
##    <chr>    <chr>      <chr>        <chr>      <dbl>   <dbl>  <dbl>  <dbl>
##  1 Punjab   Bahawalpur Bahawalpur   Hasilpur  317513  456006 138493 7.74e5
##  2 Punjab   Bahawalpur Bahawalpur   Khairpu…  183903  262628  78725 4.47e5
##  3 Punjab   Bahawalpur Bahawalpur   Yazman    168950  262175  93225 4.31e5
##  4 Punjab   Bahawalpur Bahawalpur   Ahmadpu…  718297 1078683 360386 1.80e6
##  5 Punjab   Bahawalpur Bahawalpur   Bahawal…  419542  681696 262154 1.10e6
##  6 Punjab   Bahawalpur Bahawalpur   Bahawal…  387038  574950 187912 9.62e5
##  7 Punjab   Bahawalpur Bahawalnagar Bahawal…  541553  815143 273590 1.36e6
##  8 Punjab   Bahawalpur Bahawalnagar Chishti…  498270  691221 192951 1.19e6
##  9 Punjab   Bahawalpur Bahawalnagar Haroona…  381767  525598 143831 9.07e5
## 10 Punjab   Bahawalpur Bahawalnagar Fort Ab…   92777  153444  60667 2.46e5
## # ... with 533 more rows
newData<-dplyr::mutate(tehsil, new = ifelse(Province == "Punjab", 1,
                            ifelse(Province == "Sindh",2,3)))
newData
## # A tibble: 543 x 7
##    Province Division   District     Tehsil           Pop1998 Pop2017   new
##    <chr>    <chr>      <chr>        <chr>              <dbl>   <dbl> <dbl>
##  1 Punjab   Bahawalpur Bahawalpur   Hasilpur          317513  456006     1
##  2 Punjab   Bahawalpur Bahawalpur   Khairpur Tamewa…  183903  262628     1
##  3 Punjab   Bahawalpur Bahawalpur   Yazman            168950  262175     1
##  4 Punjab   Bahawalpur Bahawalpur   Ahmadpur East     718297 1078683     1
##  5 Punjab   Bahawalpur Bahawalpur   Bahawalpur City   419542  681696     1
##  6 Punjab   Bahawalpur Bahawalpur   Bahawalpur Sadd…  387038  574950     1
##  7 Punjab   Bahawalpur Bahawalnagar Bahawalnagar      541553  815143     1
##  8 Punjab   Bahawalpur Bahawalnagar Chishtian         498270  691221     1
##  9 Punjab   Bahawalpur Bahawalnagar Haroonabad        381767  525598     1
## 10 Punjab   Bahawalpur Bahawalnagar Fort Abbas         92777  153444     1
## # ... with 533 more rows

PakPC2017Tehsil Data Frame

# Total population per Province/Division for both censuses, restricted to the
# Faisalabad and Lahore divisions, then reshaped to long format so that each
# row holds one division, one census label and its population.
plotUnit <- PakPC2017Tehsil %>%
  dplyr::group_by(Province, Division) %>%
  dplyr::summarize(
    Pop2017 = sum(Pop2017, na.rm = TRUE),
    Pop1998 = sum(Pop1998, na.rm = TRUE)
  ) %>%
  dplyr::filter(Division %in% c("Faisalabad", "Lahore")) %>%
  tidyr::gather(
    key = "Census",
    value = "Population",
    c("Pop2017", "Pop1998")
  )

head(plotUnit)
## # A tibble: 4 x 4
## # Groups:   Province [1]
##   Province Division   Census  Population
##   <chr>    <chr>      <chr>        <dbl>
## 1 Punjab   Faisalabad Pop2017   14177081
## 2 Punjab   Lahore     Pop2017   19398081
## 3 Punjab   Faisalabad Pop1998    9885685
## 4 Punjab   Lahore     Pop1998   12015649
g<-ggplot(data = plotUnit, mapping = aes(x = Division, y = Population))
g

g<-ggplot(data = plotUnit, mapping = aes(x = Division, y = Population))+
  geom_col()
g

g<-ggplot(data = plotUnit, mapping = aes(x = Division, y = Population))+
  geom_col()+
  scale_y_continuous(labels= scales::comma, expand =  c(0, 0))
g

g<-ggplot(data = plotUnit, mapping = aes(x = Division, y = Population))+
  geom_bar(stat = "identity", position = position_dodge(width = .9))+
  scale_y_continuous(labels= scales::comma, expand =  c(0, 0))
g

g<-ggplot(data = plotUnit,
          mapping = aes(x = Division, y = Population, fill = Census))+
  geom_bar(stat = "identity", position = position_dodge(width = .9))+
  scale_y_continuous(labels= scales::comma, expand =  c(0, 0))
g

g<-ggplot(data = plotUnit,
          mapping = aes(x = Division, y = Population, fill = Census))+
  geom_bar(stat = "identity", position = position_dodge(width = .9))+
  scale_y_continuous(labels= scales::comma, expand =  c(0, 0))+
  geom_text(aes(label=scales::comma(Population)),
            position=position_dodge(width = 0.9), vjust = 0.8)
g

g<-ggplot(data = plotUnit,
          mapping = aes(x = Division, y = Population, fill = Census))+
  geom_bar(stat = "identity", position = position_dodge(width = .9))+
  scale_y_continuous(labels= scales::comma, expand =  c(0, 0))+
  geom_text(aes(label=scales::comma(Population)),
            position=position_dodge(width = 0.9), vjust = 0.8)+
  facet_wrap(~Province, scales = "free_x")
g

# Same aggregation as before, now for four divisions. Every column except
# Province and Division is gathered into the Census/Population pair.
plotUnit <- PakPC2017Tehsil %>%
  dplyr::group_by(Province, Division) %>%
  dplyr::summarize(
    Pop2017 = sum(Pop2017, na.rm = TRUE),
    Pop1998 = sum(Pop1998, na.rm = TRUE)
  ) %>%
  dplyr::filter(
    Division %in% c("Faisalabad", "Lahore", "Karachi", "Quetta")
  ) %>%
  tidyr::gather(
    key = "Census",
    value = "Population",
    -Province,
    -Division
  )

g<-ggplot(data = plotUnit,
          mapping = aes(x = Division, y = Population, fill = Census))+
  geom_bar(stat = "identity", position = position_dodge(width = .9))+
  scale_y_continuous(labels= scales::comma, expand =  c(0, 0))+
  geom_text(aes(label=scales::comma(Population)),
            position=position_dodge(width = 0.9), vjust = 0.8)+
  facet_wrap(~Province, scales = "free_x")
g

g<-ggplot(data = PakPC2017Pakistan, mapping = aes(x = Province, y = Pop2017))
g+geom_point(aes(color = ResStatus))+theme(legend.position = "top")
## Warning: Removed 20 rows containing missing values (geom_point).

Scatter Plot

ggplot(PakPC2017Tehsil, aes(Pop1998,Pop2017))+geom_point()+geom_smooth(method = lm)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

dygraphs

The dygraphs package is an R interface to the dygraphs JavaScript charting library. It provides rich facilities for charting time-series data in R.

library(dygraphs)
dygraph(nhtemp, main = "New Haven Temperatures", ylab = "Temp (F)")
lungDeaths <- cbind(mdeaths, fdeaths)
dygraph(lungDeaths)
dygraph(lungDeaths) %>% dyRangeSelector()
dygraph(lungDeaths) %>%
  dySeries("mdeaths", label = "Male") %>%
  dySeries("fdeaths", label = "Female") %>%
  dyOptions(stackedGraph = TRUE) %>%
  dyRangeSelector(height = 10)
# One date per year from 1990-01-01 to 2018-01-01.
Year <- seq(
  from = as.Date("1990/1/1"),
  to = as.Date("2018/1/1"),
  by = "years"
)
# One simulated standard-normal value per date. The sample size is taken from
# `Year` rather than hard-coded, so the two stay in step if the range changes.
prod <- rnorm(length(Year), mean = 0, sd = 1)
timeSeries<-xts::xts(prod,Year)
dygraph(timeSeries)

plotly

library(ggplot2)
library(plotly)
myPlot<-ggplot(data = mpg, mapping = aes(x = displ, y = hwy))+
  geom_point(mapping = aes(color = drv))+
  geom_smooth(mapping = aes(linetype = drv))
ggplotly(myPlot)

datatable

library(readxl)
spi <- read_excel("C:/Users/PBS FAISALABAD/Desktop/spi.xls")
DT::datatable(spi,fillContainer = TRUE, editable = TRUE)
comments powered by Disqus