-An R Notebook is an R Markdown document with chunks that can be executed independently and interactively, with output visible immediately beneath the input.
-Notebook output is available as HTML, PDF, Word, or LaTeX.
-The HTML version of this Notebook is best opened with Google Chrome.
-The R code can be extracted as an Rmd file via the “Code” button in the notebook.
-This Notebook uses iterative development: the process starts with a simple implementation of a small set of requirements and iteratively enhances the evolving versions until the complete version is implemented and polished.
In-depth summarizing
1-With summary() from base R:
# Overall summary of every column in `data`.
summary(object = data)

# Per-group summaries: split the rows of `data` by a grouping column
# (here `age` is used as the example category) and summarise each subset.
by(data = data, INDICES = data$age, FUN = summary)
2-skim(), from the skimr package:
# skimr: per-column descriptive statistics.
# Install each package only when it is missing, so re-running this chunk
# does not re-download and re-install it every time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(skimr)
library(dplyr) # provides group_by() and the %>% pipe used below

# Descriptive statistics for the two data sets.
skim(data)
skim(data_excel)

# Update: the same statistics, computed separately per `category` group.
data %>%
  group_by(category) %>%
  skim()
3-describe, from the Hmisc package:
# Hmisc: describe() overview of every column.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# Namespace-qualified call: always uses the Hmisc implementation.
Hmisc::describe(data)

# Unqualified call: resolves to whichever attached package exporting
# describe() comes first on the search path (this notebook also attaches
# psych, which exports a describe() of its own).
describe(data)
4-stat.desc(), from the pastecs package:
# pastecs: stat.desc() descriptive statistics table.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("pastecs", quietly = TRUE)) {
  install.packages("pastecs")
}
library(pastecs)

stat.desc(data)
5-describe and describeBy, from the psych package:
# psych: describe() for the whole data set and describeBy() per group.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)

# Qualified and unqualified forms; the unqualified one depends on which
# package exporting describe() was attached last.
psych::describe(data)
describe(data)

# Statistics split by the `type` column.
psych::describeBy(data, data$type)

# `mat = TRUE` requests the grouped output above in matrix form.
psych::describeBy(data, data$type, mat = TRUE)
6-descr and dfSummary, from the summarytools package:
# summarytools: descr() for numeric statistics, dfSummary() for a full
# per-column overview.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("summarytools", quietly = TRUE)) {
  install.packages("summarytools")
}
library(summarytools)

# descr() only works with numerical data.
summarytools::descr(data)
descr(data)

# As a data.frame, rendered as a table. kable() is namespace-qualified
# because no package providing it is attached in this notebook; the bare
# call fails with "could not find function".
knitr::kable(as.data.frame(summarytools::descr(data)))
summarytools::descr(data)

# Transposed layout.
summarytools::descr(data, transpose = TRUE)

# Complete overview from summarytools.
dfSummary(data)
7-CreateTableOne, from the tableone package:
# tableone: CreateTableOne() summary table, optionally stratified.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("tableone", quietly = TRUE)) {
  install.packages("tableone")
}
library(tableone)

# Table for the whole data set, then its detailed summary.
CreateTableOne(data = data)
summary(CreateTableOne(data = data))

# Stratified by the `category` column.
CreateTableOne(strata = "category", data = data)

# For example, if we think "score" should not be treated as normal,
# name it in `nonnormal` when printing the stratified table.
print(
  CreateTableOne(strata = "category", data = data),
  nonnormal = "score"
)
8-desctable, from the desctable package:
# desctable: descriptive tables, plain, grouped and with custom statistics.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("desctable", quietly = TRUE)) {
  install.packages("desctable")
}
library(desctable)
# group_by() and %>% come from dplyr; attach it here so this chunk also
# works when it is executed on its own.
library(dplyr)

desctable(data)

# One table per `category` group.
data %>%
  group_by(category) %>%
  desctable()

# The statistics are customisable: pass a named list of functions.
desctable(
  data,
  stats = list(
    "N" = length,
    "Mean" = mean,
    "SD" = sd,
    "Min" = min,
    "Max" = max
  )
)
9-ggpairs, from the GGally package:
# GGally: ggpairs() pairwise plot matrix.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)

# All columns against each other.
ggpairs(data)

# Same matrix, coloured by the `category` column.
# NOTE(review): aes() is a ggplot2 function; this presumably relies on
# ggplot2 being attached together with GGally - confirm.
ggpairs(data, mapping = aes(colour = category))
10-ds_summary_stats from descriptr:
# descriptr: summary statistics, screening and frequency tables.
# Install only when missing so the chunk can be re-run cheaply.
if (!requireNamespace("descriptr", quietly = TRUE)) {
  install.packages("descriptr")
}
library(descriptr)
# filter() below is meant to be dplyr's row filter; without dplyr attached
# the name resolves to stats::filter() and the call fails.
library(dplyr)

# NOTE(review): the ds_* calls below are kept exactly as written; their
# argument conventions may differ between descriptr releases - confirm
# against the installed version.
ds_summary_stats(data$score)
ds_screener(data)

# Drop rows with a missing `score` before computing the multi-variable stats.
ds_multi_stats(filter(data, !is.na(score)), score, rating)
ds_freq_table(data$category)
11-With dlookr: An automated report (as pdf or html):
# dlookr: data diagnosis.
# The diagnose*() functions come from dlookr and the pipeline verbs from
# dplyr; attach both so this chunk does not depend on other chunks having
# been run first.
library(dlookr)
library(dplyr)

# From here on `data` refers to the qry_neue_DL data set.
data <- qry_neue_DL

# Overall diagnosis of every variable.
View(diagnose(data))

# Variables with missing values, most affected first; the uniqueness
# columns are dropped from the result.
a <- data %>%
  diagnose() %>%
  select(-unique_count, -unique_rate) %>%
  filter(missing_count > 0) %>%
  arrange(desc(missing_count))
View(a)

# Separate diagnoses for numeric and categorical variables.
View(diagnose_numeric(data))
View(diagnose_category(data))
# 1. Missing values: keep the category-diagnosis rows whose `levels`
#    entry is NA.
data %>%
  diagnose_category() %>%
  filter(is.na(levels))

# List the levels whose `ratio` is at most 0.01 (0.01% per the original
# note), with `top = 500` passed to diagnose_category().
filter(diagnose_category(data, top = 500), ratio <= 0.01)
# Outlier diagnosis with diagnose_outlier().
data %>%
  diagnose_outlier()

# Numeric variables that contain anomalies: keep the rows with at least
# one outlier.
filter(diagnose_outlier(data), outliers_cnt > 0)

# Numeric variables with more than 5% anomalies, ordered by the ratio of
# the outlier mean to the overall mean (largest first); the outlier count
# column is dropped from the result.
data %>%
  diagnose_outlier() %>%
  filter(outliers_ratio > 5) %>%
  mutate(rate = outliers_mean / with_mean) %>%
  arrange(desc(rate)) %>%
  select(-outliers_cnt)
# Visualise the outliers of a single variable (`Alter`) with plot_outlier().
data %>%
  plot_outlier(Alter)

# Visualise the anomalies of all numeric variables with an outlier ratio
# of 0.5% or more: diagnose_outlier() plus dplyr verbs pick the variable
# names, which are then handed to plot_outlier().
# (This pipeline used to appear twice in a row, drawing every plot twice;
# it is now run once.)
data %>%
  plot_outlier(
    diagnose_outlier(data) %>%
      filter(outliers_ratio >= 0.5) %>%
      select(variables) %>%
      unlist()
  )
# Generate the automated dlookr diagnosis report for `data` in HTML
# format, using "Diagn.html" as the output file name.
diagnose_report(
  data,
  output_format = "html",
  output_file = "Diagn.html"
)
LS0tDQp0aXRsZTogIkVEQSINCnN1YnRpdGxlOiAiRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyB3aXRoIFIiDQphdXRob3I6ICJDZXZpIEhlcmRpYW4sIEIuIFNjIg0KZGF0ZTogIjMwLjAxLjIwMTkiDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6DQogICAgY29kZV9mb2xkaW5nOiBoaWRlDQogICAgaGlnaGxpZ2h0OiBweWdtZW50cw0KICAgIHRoZW1lOiBjb3Ntbw0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiA1DQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgaHRtbF9kb2N1bWVudDoNCiAgICBkZl9wcmludDogcGFnZWQNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzUnDQogIHBkZl9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzUnDQogIHdvcmRfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6ICc1Jw0KLS0tDQoNCg0KKipOb3RlOioqDQoNCi1BbiBSIE5vdGVib29rIGlzIGFuIFIgTWFya2Rvd24gZG9jdW1lbnQgd2l0aCBjaHVua3MgdGhhdCBjYW4gYmUgZXhlY3V0ZWQgaW5kZXBlbmRlbnRseSBhbmQgaW50ZXJhY3RpdmVseSwgd2l0aCBvdXRwdXQgdmlzaWJsZSBpbW1lZGlhdGVseSBiZW5lYXRoIHRoZSBpbnB1dC4NCg0KLU5vdGVib29rIG91dHB1dCBhcmUgYXZhaWxhYmxlIGFzIEhUTUwsIFBERiwgV29yZCwgb3IgTGF0ZXguIA0KDQotVGhpcyBOb3RlYm9vayBhcyBIVE1MIGlzIHByZWZlcmFibHkgb3BlbiB3aXRoIEdvb2dsZSBDaHJvbWUuDQoNCi1SLUNvZGUgY2FuIGJlIGV4dHJhY3RlZCBhcyBSbWQgZmlsZSB1bmRlciB0aGUgYnV0dG9uICJDb2RlIiBpbiB0aGUgbm90ZWJvb2suDQoNCi1UaGlzIE5vdGVib29rIHVzaW5nIGl0ZXJhdGl2ZSBkZXZlbG9wbWVudC4gSXQgbWVhbnMgdGhlIHByb2Nlc3Mgc3RhcnRzIHdpdGggYSBzaW1wbGUgaW1wbGVtZW50YXRpb24gb2YgYSBzbWFsbCBzZXQgb2YgaWRlYSByZXF1aXJlbWVudHMgYW5kIGl0ZXJhdGl2ZWx5IGVuaGFuY2VzIHRoZSBldm9sdmluZyB2ZXJzaW9ucyB1bnRpbCB0aGUgY29tcGxldGUgdmVyc2lvbiBpcyBpbXBsZW1lbnRlZCBhbmQgcGVyZmVjdC4NCg0KDQohW10oZWRhMS5qcGcpDQoNCg0KDQoNCmBgYHtyfQ0KDQojaHR0cHM6Ly93d3cuZXhjZWxyLmNvbS9leHBsb3JhdG9yeS1kYXRhLWFuYWx5c2lzLWluLWRhdGEtc2NpZW5jZS8NCg0KYGBgDQoNCg0KDQojR2VuZXJhbCBpZGVudGlmeWluZw0KDQoqIFZpZXcoZGF0YSkNCiogZ2xpbXBzZShkYXRhKSANCiogc3BlYyhkYXRhKSBmb3IgY3N2IGZpbGUNCiogYXR0cmlidXRlcyhkYXRhKQ0KKiBjbGFzcyhkYXRhKQ0KDQoNCg0KI0luIGRlcHRoIHN1bW1hcml6aW5nDQoNCioqMS1XaXRoIFN1bW1hcnkoKSBmcm9tIEJhc2U6KioNCg0KYGBge3J9DQpzdW1tYXJ5KGRhdGEpDQoNCiNHcm91cGluZyB3aXRoIHNvbWUgb2YgY2F0ZWdvcnkNCiNleGFtcGxlIGNhdGVnb3J5OiBhZ2UNCmJ5KGRhdGEsZGF0YSRhZ2UsIHN1bW1hcnkpDQpgYGANCg0KDQoqKjItc2tp
bSgpLCBmcm9tIHRoZSBza2ltciBwYWNrYWdlOioqDQoNCmBgYHtyfQ0KDQppbnN0YWxsLnBhY2thZ2VzKCJza2ltciIpDQpsaWJyYXJ5KHNraW1yKQ0KIyBEZXNjcmlwdGl2ZSBzdGF0aXN0aWNzIA0Kc2tpbShkYXRhKQ0Kc2tpbShkYXRhX2V4Y2VsKQ0KDQojdXBkYXRlIA0KaW5zdGFsbC5wYWNrYWdlcygiZHBseXIiKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc2tpbXIpDQpncm91cF9ieShkYXRhLCBjYXRlZ29yeSkgJT4lIHNraW0oKQ0KDQpgYGANCg0KDQoqKjMtZGVzY3JpYmUsIGZyb20gdGhlIEhtaXNjIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoIkhtaXNjIikNCmxpYnJhcnkoSG1pc2MpDQpIbWlzYzo6ZGVzY3JpYmUoZGF0YSkNCmRlc2NyaWJlKGRhdGEpDQoNCmBgYA0KDQoNCioqNC1zdGF0LmRlc2MoKSwgZnJvbSB0aGUgcGFzdGVjcyBwYWNrYWdlOioqDQoNCmBgYHtyfQ0KDQppbnN0YWxsLnBhY2thZ2VzKCJwYXN0ZWNzIikNCmxpYnJhcnkocGFzdGVjcykNCnN0YXQuZGVzYyhkYXRhKQ0KDQpgYGANCg0KDQoqKjUtZGVzY3JpYmUgYW5kIGRlc2NyaWJlQnksIGZyb20gdGhlIHBzeWNoIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoInBzeWNoIikNCmxpYnJhcnkocHN5Y2gpDQpwc3ljaDo6ZGVzY3JpYmUoZGF0YSkNCmRlc2NyaWJlKGRhdGEpDQpwc3ljaDo6ZGVzY3JpYmVCeShkYXRhLCBkYXRhJHR5cGUpDQoNCiNUaGUg4oCcbWF04oCdIHBhcmFtZXRlciBkb2VzIGFsbG93IHlvdSB0byBwcm9kdWNlIA0KI2EgbWF0cml4IG91dHB1dCBvZiB0aGUgYWJvdmUuDQpwc3ljaDo6ZGVzY3JpYmVCeShkYXRhLCBkYXRhJHR5cGUsIG1hdCA9IFRSVUUpDQoNCmBgYA0KDQoqKjYtZGVzY3IgYW5kIGRmU3VtbWFyeSwgZnJvbSB0aGUgc3VtbWFyeXRvb2xzIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoInN1bW1hcnl0b29scyIpDQpsaWJyYXJ5KHN1bW1hcnl0b29scykNCnN1bW1hcnl0b29sczo6ZGVzY3IoZGF0YSkNCiNPbmx5IHdvcmtzIHdpdGggbnVtZXJpY2FsIGRhdGEuIA0KZGVzY3IoZGF0YSkNCg0KI2FzIGRhdGEuZnJhbWUNCmthYmxlKGFzLmRhdGEuZnJhbWUoc3VtbWFyeXRvb2xzOjpkZXNjcihkYXRhKSkpDQpzdW1tYXJ5dG9vbHM6OmRlc2NyKGRhdGEpDQoNCiN0cmFuc3Bvc2UNCnN1bW1hcnl0b29sczo6ZGVzY3IoZGF0YSwgdHJhbnNwb3NlID0gVFJVRSkNCg0KI0NvbXBsZXRlIG1lbnUgZnJvbSBzdW1tYXJ5dG9vbHMNCmRmU3VtbWFyeShkYXRhKQ0KDQpgYGANCg0KDQoqKjctQ3JlYXRlVGFibGVPbmUsIGZyb20gdGhlIHRhYmxlb25lIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoInRhYmxlb25lIikNCmxpYnJhcnkodGFibGVvbmUpDQoNCkNyZWF0ZVRhYmxlT25lKGRhdGEgPSBkYXRhKQ0Kc3VtbWFyeShDcmVhdGVUYWJsZU9uZShkYXRhID0gZGF0YSkpDQoNCkNyZWF0ZVRhYmxlT25lKHN0cmF0YSA9ICJj
YXRlZ29yeSIsIGRhdGEgPSBkYXRhKQ0KDQojRm9yIGV4YW1wbGUsIGlmIHdlIHRoaW5rIOKAnHNjb3Jl4oCdIA0KI3Nob3VsZCBub3QgYmUgdHJlYXRlZCBhcyBub3JtYWw6DQpwcmludChDcmVhdGVUYWJsZU9uZShzdHJhdGEgPSAiY2F0ZWdvcnkiLCBkYXRhID0gZGF0YSksIA0Kbm9ubm9ybWFsID0gInNjb3JlIikNCg0KYGBgDQoNCg0KKio4LWRlc2N0YWJsZSwgZnJvbSB0aGUgZGVzY3RhYmxlIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoImRlc2N0YWJsZSIpDQpsaWJyYXJ5KGRlc2N0YWJsZSkNCmRlc2N0YWJsZShkYXRhKQ0KDQpncm91cF9ieShkYXRhLCBjYXRlZ29yeSkgJT4lDQpkZXNjdGFibGUoKQ0KIA0KI1RoaXMgZnVuY3Rpb24gaXMgc3VwZXIgY3VzdG9taXNhYmxlLg0KZGVzY3RhYmxlKGRhdGEsc3RhdHMgPSBsaXN0KCJOIiA9IGxlbmd0aCwgIk1lYW4iID0gbWVhbiwgDQoiU0QiID0gc2QsICJNaW4iID0gbWluLCAiTWF4IiA9IG1heCkpDQoNCmBgYA0KDQoNCg0KKio5LWdncGFpcnMsIGZyb20gdGhlIEdHYWxseSBwYWNrYWdlOioqDQoNCmBgYHtyfQ0KDQppbnN0YWxsLnBhY2thZ2VzKCJHR2FsbHkiKQ0KbGlicmFyeShHR2FsbHkpDQoNCmdncGFpcnMoZGF0YSkNCmdncGFpcnMoZGF0YSwgbWFwcGluZyA9IGFlcyhjb2xvdXIgPSBjYXRlZ29yeSkpDQoNCmBgYA0KDQoqKjEwLWRzX3N1bW1hcnlfc3RhdHMgZnJvbSBkZXNjcmlwdHI6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoImRlc2NyaXB0ciIpDQpsaWJyYXJ5KGRlc2NyaXB0cikNCg0KZHNfc3VtbWFyeV9zdGF0cyhkYXRhJHNjb3JlKQ0KZHNfc2NyZWVuZXIoZGF0YSkNCmRzX211bHRpX3N0YXRzKGZpbHRlcihkYXRhLCAhaXMubmEoc2NvcmUpKSwgc2NvcmUsIHJhdGluZykNCmRzX2ZyZXFfdGFibGUoZGF0YSRjYXRlZ29yeSkNCg0KYGBgDQoNCg0KKioxMS1XaXRoIGRsb29rcjogIEFuIGF1dG9tYXRlZCByZXBvcnQgKGFzIHBkZiBvciBodG1sKToqKg0KDQpgYGB7cn0NCg0KZGF0YTwtcXJ5X25ldWVfREwNClZpZXcoZGlhZ25vc2UoZGF0YSkpDQoNCmE8LWRhdGEgJT4lDQogIGRpYWdub3NlKCkgJT4lDQogIHNlbGVjdCgtdW5pcXVlX2NvdW50LCAtdW5pcXVlX3JhdGUpICU+JSANCiAgZmlsdGVyKG1pc3NpbmdfY291bnQgPiAwKSAlPiUgDQogIGFycmFuZ2UoZGVzYyhtaXNzaW5nX2NvdW50KSkNClZpZXcoYSkNCg0KVmlldyhkaWFnbm9zZV9udW1lcmljKGRhdGEpKQ0KVmlldyhkaWFnbm9zZV9jYXRlZ29yeShkYXRhKSkNCg0KI05vIDEuIE1pc3NpbmcgdmFsdWVzDQpkaWFnbm9zZV9jYXRlZ29yeShkYXRhKSAlPiUgDQogIGZpbHRlcihpcy5uYShsZXZlbHMpKQ0KDQojMC4wMSUgbGlzdCBsZXZlbHMNCmRhdGEgJT4lDQogIGRpYWdub3NlX2NhdGVnb3J5KHRvcCA9IDUwMCkgICU+JQ0KICBmaWx0ZXIocmF0aW8gPD0gMC4wMSkNCg0KDQojRGlhZ25vc2luZyBvdXRsaWVycyB3aXRoIGRpYWdub3NlX291dGxpZXIoKQ0KZGlh
Z25vc2Vfb3V0bGllcihkYXRhKQ0KDQojTnVtZXJpYyB2YXJpYWJsZXMgdGhhdCBjb250YWluIGFub21hbGllcyBhcmUgZWFzaWx5IGZvdW5kIA0KI3dpdGggZmlsdGVyKCkuOg0KZGlhZ25vc2Vfb3V0bGllcihkYXRhKSAlPiUgDQogIGZpbHRlcihvdXRsaWVyc19jbnQgPiAwKSANCg0KI1RoZSBmb2xsb3dpbmcgaXMgYSBsaXN0IG9mIG51bWVyaWMgdmFyaWFibGVzIHdpdGggDQojYW5vbWFsaWVzIGdyZWF0ZXIgdGhhbiA1JS46DQpkaWFnbm9zZV9vdXRsaWVyKGRhdGEpICU+JSANCiAgZmlsdGVyKG91dGxpZXJzX3JhdGlvID4gNSkgJT4lIA0KICBtdXRhdGUocmF0ZSA9IG91dGxpZXJzX21lYW4gLyB3aXRoX21lYW4pICU+JSANCiAgYXJyYW5nZShkZXNjKHJhdGUpKSAlPiUgDQogIHNlbGVjdCgtb3V0bGllcnNfY250KQ0KDQojVmlzdWFsaXphdGlvbiBvZiBvdXRsaWVycyB1c2luZyBwbG90X291dGxpZXIoKQ0KZGF0YSAlPiUNCiAgcGxvdF9vdXRsaWVyKEFsdGVyKSANCg0KI1VzZSB0aGUgZnVuY3Rpb24gb2YgdGhlIGRwbHlyIHBhY2thZ2UgYW5kIHBsb3Rfb3V0bGllcigpIA0KI2FuZCBkaWFnbm9zZV9vdXRsaWVyKCkgdG8gdmlzdWFsaXplIGFub21hbHkgdmFsdWVzIA0KI29mIGFsbCBudW1lcmljIHZhcmlhYmxlcyB3aXRoIGFuIG91dGxpZXIgcmF0aW8gDQojb2YgMC41JSBvciBtb3JlLjoNCg0KZGF0YSAlPiUNCiAgcGxvdF9vdXRsaWVyKGRpYWdub3NlX291dGxpZXIoZGF0YSkgJT4lIA0KICAgICAgICAgICAgICAgICBmaWx0ZXIob3V0bGllcnNfcmF0aW8gPj0gMC41KSAlPiUgDQogICAgICAgICAgICAgICAgIHNlbGVjdCh2YXJpYWJsZXMpICU+JSANCiAgICAgICAgICAgICAgICAgdW5saXN0KCkpDQoNCmRhdGEgJT4lDQogIHBsb3Rfb3V0bGllcihkaWFnbm9zZV9vdXRsaWVyKGRhdGEpICU+JSANCiAgICAgICAgICAgICAgICAgZmlsdGVyKG91dGxpZXJzX3JhdGlvID49IDAuNSkgJT4lIA0KICAgICAgICAgICAgICAgICBzZWxlY3QodmFyaWFibGVzKSAlPiUgDQogICAgICAgICAgICAgICAgIHVubGlzdCgpKQ0KDQpkYXRhICU+JQ0KICBkaWFnbm9zZV9yZXBvcnQob3V0cHV0X2Zvcm1hdCA9ICJodG1sIiwgDQogIG91dHB1dF9maWxlID0gIkRpYWduLmh0bWwiKQ0KDQpgYGANCg0KDQojQ2hhbmdlIGxvZyB1cGRhdGUNCg0KKiAzMC4wOS4yMDE4DQoqIDMwLjAxLjIwMTkNCg0KPEJyPg0KDQojUHJlZmVyZW5jZXMNCg0KKiBodHRwczovL2Jvb2tkb3duLm9yZy9yZHBlbmcvZXhkYXRhL2V4cGxvcmF0b3J5LWRhdGEtYW5hbHlzaXMtY2hlY2tsaXN0Lmh0bWwgDQoqIGh0dHBzOi8vd3d3LnN0YXRpc3Rpay1uYWNoaGlsZmUuZGUvDQoqIGh0dHBzOi8vd3d3LmNyYXNoa3Vycy1zdGF0aXN0aWsuZGUvDQoqIGh0dHBzOi8vZGF0YXNjaWVuY2VwbHVzLmNvbS8NCiogaHR0cHM6Ly90b3dhcmRzZGF0YXNjaWVuY2UuY29tLw0KDQo8QnI+DQoNCiNMaWNlbnNlDQoNCltNSVRdKGh0dHBzOi8vb3BlbnNvdXJjZS5vcmcvbGljZW5zZXMvTUlU
KQ==