Note:

-An R Notebook is an R Markdown document with chunks that can be executed independently and interactively, with output visible immediately beneath the input.

-Notebook output is available as HTML, PDF, Word, or LaTeX.

-The HTML version of this notebook is best opened with Google Chrome.

-The R code can be extracted as an Rmd file via the “Code” button in the notebook.

-This notebook uses iterative development: the process starts with a simple implementation of a small set of requirements and iteratively enhances the evolving versions until the complete, polished version is implemented.


#https://www.excelr.com/exploratory-data-analysis-in-data-science/

General identification

  • View(data)
  • glimpse(data)
  • spec(data) for csv file
  • attributes(data)
  • class(data)

In-depth summarizing

1-With summary() from base R:

# Column-wise summary of the whole data set
summary(data)

# The same summary, computed separately for each value of a grouping column
# (here the grouping column is `age`)
by(data = data, INDICES = data$age, FUN = summary)

2-skim(), from the skimr package:


# Install skimr and dplyr only when they are missing, so re-running or
# knitting this chunk does not trigger a fresh download every time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
library(skimr)

# Descriptive statistics for every column
skim(data)
# NOTE(review): `data_excel` is not defined in the visible part of this
# notebook; it has to exist in the session for this call to work.
skim(data_excel)

# Update: descriptive statistics per level of `category`
group_by(data, category) %>% skim()

3-describe, from the Hmisc package:


# Install Hmisc only when it is missing, so re-running or knitting this chunk
# does not reinstall the package every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# Namespace-qualified call: always uses the describe() from Hmisc
Hmisc::describe(data)
# Unqualified call: uses whichever attached package provides describe() first
# on the search path (psych, used elsewhere in this notebook, has one too)
describe(data)

4-stat.desc(), from the pastecs package:


# Install pastecs only when it is missing, so re-running or knitting this
# chunk does not reinstall the package every time.
if (!requireNamespace("pastecs", quietly = TRUE)) {
  install.packages("pastecs")
}
library(pastecs)

# Descriptive statistics table for the data set
stat.desc(data)

5-describe and describeBy, from the psych package:


# Install psych only when it is missing, so re-running or knitting this chunk
# does not reinstall the package every time.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)

# Namespace-qualified call: always uses the describe() from psych
psych::describe(data)
# Unqualified call: uses whichever attached package provides describe() first
# on the search path (Hmisc, used elsewhere in this notebook, has one too)
describe(data)

# Descriptive statistics per level of `type`
psych::describeBy(data, data$type)

# The "mat" parameter does allow you to produce
# a matrix output of the above.
psych::describeBy(data, data$type, mat = TRUE)

6-descr and dfSummary, from the summarytools package:


# Install summarytools only when it is missing, so re-running or knitting this
# chunk does not reinstall the package every time.
if (!requireNamespace("summarytools", quietly = TRUE)) {
  install.packages("summarytools")
}
library(summarytools)

# Descriptive statistics; only works with numerical data.
summarytools::descr(data)
descr(data)

# The same statistics as a data frame, rendered as a table.
# kable() lives in knitr, which this notebook never attaches, so it is called
# through its namespace.
knitr::kable(as.data.frame(summarytools::descr(data)))

# Transposed layout
summarytools::descr(data, transpose = TRUE)

# Complete overview from summarytools
dfSummary(data)

7-CreateTableOne, from the tableone package:


# Install tableone only when it is missing, so re-running or knitting this
# chunk does not reinstall the package every time.
if (!requireNamespace("tableone", quietly = TRUE)) {
  install.packages("tableone")
}
library(tableone)

# Overall table, followed by its detailed summary
CreateTableOne(data = data)
summary(CreateTableOne(data = data))

# Table stratified by `category`
CreateTableOne(strata = "category", data = data)

# For example, if we think "score"
# should not be treated as normal:
print(
  CreateTableOne(strata = "category", data = data),
  nonnormal = "score"
)

8-desctable, from the desctable package:


# Install desctable only when it is missing, so re-running or knitting this
# chunk does not reinstall the package every time.
if (!requireNamespace("desctable", quietly = TRUE)) {
  install.packages("desctable")
}
library(desctable)
# group_by() and %>% are used below; attach dplyr here so this chunk also
# works when it is executed on its own.
library(dplyr)

# Descriptive table for the whole data set
desctable(data)

# Descriptive table per level of `category`
group_by(data, category) %>%
  desctable()

# This function is super customisable: pick the statistics to report.
desctable(
  data,
  stats = list(
    "N" = length,
    "Mean" = mean,
    "SD" = sd,
    "Min" = min,
    "Max" = max
  )
)

9-ggpairs, from the GGally package:


# Install GGally only when it is missing, so re-running or knitting this chunk
# does not reinstall the package every time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)

# Pairwise plot matrix of all columns
ggpairs(data)
# The same matrix, coloured by `category`
ggpairs(data, mapping = aes(colour = category))

10-ds_summary_stats from descriptr:


# Install descriptr only when it is missing, so re-running or knitting this
# chunk does not reinstall the package every time.
if (!requireNamespace("descriptr", quietly = TRUE)) {
  install.packages("descriptr")
}
library(descriptr)
# filter() below is meant to be dplyr's row filter; without dplyr attached the
# name resolves to stats::filter(), which does not accept these arguments.
library(dplyr)

# NOTE(review): the descriptr function names and argument conventions have
# changed between releases - confirm these calls against the installed version.
ds_summary_stats(data$score)
ds_screener(data)
# Drop rows with a missing `score` before computing the statistics
ds_multi_stats(filter(data, !is.na(score)), score, rating)
ds_freq_table(data$category)

11-With dlookr: An automated report (as pdf or html):


# Attach dlookr (diagnose*(), plot_outlier(), diagnose_report()) and dplyr
# (%>%, select(), filter(), arrange(), mutate()) so this chunk runs on its own.
# dlookr is installed only when it is missing.
if (!requireNamespace("dlookr", quietly = TRUE)) {
  install.packages("dlookr")
}
library(dlookr)
library(dplyr)

# NOTE(review): `qry_neue_DL` is not defined in the visible part of this
# notebook; it has to exist in the session before this chunk runs.
data <- qry_neue_DL
View(diagnose(data))

# Variables that have missing values, most affected first
a <- data %>%
  diagnose() %>%
  select(-unique_count, -unique_rate) %>%
  filter(missing_count > 0) %>%
  arrange(desc(missing_count))
View(a)

View(diagnose_numeric(data))
View(diagnose_category(data))

# No. 1: missing values among the category levels
diagnose_category(data) %>%
  filter(is.na(levels))

# Levels with a ratio of at most 0.01 (0.01%), with `top` raised to 500
data %>%
  diagnose_category(top = 500) %>%
  filter(ratio <= 0.01)


# Diagnosing outliers with diagnose_outlier()
diagnose_outlier(data)

# Numeric variables that contain anomalies are easily found
# with filter():
diagnose_outlier(data) %>%
  filter(outliers_cnt > 0)

# The following is a list of numeric variables with
# anomalies greater than 5%:
diagnose_outlier(data) %>%
  filter(outliers_ratio > 5) %>%
  mutate(rate = outliers_mean / with_mean) %>%
  arrange(desc(rate)) %>%
  select(-outliers_cnt)

# Visualization of outliers using plot_outlier()
data %>%
  plot_outlier(Alter)

# Use the functions of the dplyr package together with plot_outlier()
# and diagnose_outlier() to visualize anomaly values
# of all numeric variables with an outlier ratio
# of 0.5% or more:
data %>%
  plot_outlier(diagnose_outlier(data) %>%
                 filter(outliers_ratio >= 0.5) %>%
                 select(variables) %>%
                 unlist())

# Write the automated diagnosis report as an HTML file
data %>%
  diagnose_report(
    output_format = "html",
    output_file = "Diagn.html"
  )

Change log update

  • 30.09.2018
  • 30.01.2019


License

MIT

LS0tDQp0aXRsZTogIkVEQSINCnN1YnRpdGxlOiAiRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyB3aXRoIFIiDQphdXRob3I6ICJDZXZpIEhlcmRpYW4sIEIuIFNjIg0KZGF0ZTogIjMwLjAxLjIwMTkiDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6DQogICAgY29kZV9mb2xkaW5nOiBoaWRlDQogICAgaGlnaGxpZ2h0OiBweWdtZW50cw0KICAgIHRoZW1lOiBjb3Ntbw0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiA1DQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgaHRtbF9kb2N1bWVudDoNCiAgICBkZl9wcmludDogcGFnZWQNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzUnDQogIHBkZl9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzUnDQogIHdvcmRfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6ICc1Jw0KLS0tDQoNCg0KKipOb3RlOioqDQoNCi1BbiBSIE5vdGVib29rIGlzIGFuIFIgTWFya2Rvd24gZG9jdW1lbnQgd2l0aCBjaHVua3MgdGhhdCBjYW4gYmUgZXhlY3V0ZWQgaW5kZXBlbmRlbnRseSBhbmQgaW50ZXJhY3RpdmVseSwgd2l0aCBvdXRwdXQgdmlzaWJsZSBpbW1lZGlhdGVseSBiZW5lYXRoIHRoZSBpbnB1dC4NCg0KLU5vdGVib29rIG91dHB1dCBhcmUgYXZhaWxhYmxlIGFzIEhUTUwsIFBERiwgV29yZCwgb3IgTGF0ZXguIA0KDQotVGhpcyBOb3RlYm9vayBhcyBIVE1MIGlzIHByZWZlcmFibHkgb3BlbiB3aXRoIEdvb2dsZSBDaHJvbWUuDQoNCi1SLUNvZGUgY2FuIGJlIGV4dHJhY3RlZCBhcyBSbWQgZmlsZSB1bmRlciB0aGUgYnV0dG9uICJDb2RlIiBpbiB0aGUgbm90ZWJvb2suDQoNCi1UaGlzIE5vdGVib29rIHVzaW5nIGl0ZXJhdGl2ZSBkZXZlbG9wbWVudC4gSXQgbWVhbnMgdGhlIHByb2Nlc3Mgc3RhcnRzIHdpdGggYSBzaW1wbGUgaW1wbGVtZW50YXRpb24gb2YgYSBzbWFsbCBzZXQgb2YgaWRlYSByZXF1aXJlbWVudHMgYW5kIGl0ZXJhdGl2ZWx5IGVuaGFuY2VzIHRoZSBldm9sdmluZyB2ZXJzaW9ucyB1bnRpbCB0aGUgY29tcGxldGUgdmVyc2lvbiBpcyBpbXBsZW1lbnRlZCBhbmQgcGVyZmVjdC4NCg0KDQohW10oZWRhMS5qcGcpDQoNCg0KDQoNCmBgYHtyfQ0KDQojaHR0cHM6Ly93d3cuZXhjZWxyLmNvbS9leHBsb3JhdG9yeS1kYXRhLWFuYWx5c2lzLWluLWRhdGEtc2NpZW5jZS8NCg0KYGBgDQoNCg0KDQojR2VuZXJhbCBpZGVudGlmeWluZw0KDQoqIFZpZXcoZGF0YSkNCiogZ2xpbXBzZShkYXRhKSANCiogc3BlYyhkYXRhKSBmb3IgY3N2IGZpbGUNCiogYXR0cmlidXRlcyhkYXRhKQ0KKiBjbGFzcyhkYXRhKQ0KDQoNCg0KI0luIGRlcHRoIHN1bW1hcml6aW5nDQoNCioqMS1XaXRoIFN1bW1hcnkoKSBmcm9tIEJhc2U6KioNCg0KYGBge3J9DQpzdW1tYXJ5KGRhdGEpDQoNCiNHcm91cGluZyB3aXRoIHNvbWUgb2YgY2F0ZWdvcnkNCiNleGFtcGxlIGNhdGVnb3J5OiBhZ2UNCmJ5KGRhdGEsZGF0YSRhZ2UsIHN1bW1hcnkpDQpgYGANCg0KDQoqKjItc2tp
bSgpLCBmcm9tIHRoZSBza2ltciBwYWNrYWdlOioqDQoNCmBgYHtyfQ0KDQppbnN0YWxsLnBhY2thZ2VzKCJza2ltciIpDQpsaWJyYXJ5KHNraW1yKQ0KIyBEZXNjcmlwdGl2ZSBzdGF0aXN0aWNzIA0Kc2tpbShkYXRhKQ0Kc2tpbShkYXRhX2V4Y2VsKQ0KDQojdXBkYXRlIA0KaW5zdGFsbC5wYWNrYWdlcygiZHBseXIiKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc2tpbXIpDQpncm91cF9ieShkYXRhLCBjYXRlZ29yeSkgJT4lIHNraW0oKQ0KDQpgYGANCg0KDQoqKjMtZGVzY3JpYmUsIGZyb20gdGhlIEhtaXNjIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoIkhtaXNjIikNCmxpYnJhcnkoSG1pc2MpDQpIbWlzYzo6ZGVzY3JpYmUoZGF0YSkNCmRlc2NyaWJlKGRhdGEpDQoNCmBgYA0KDQoNCioqNC1zdGF0LmRlc2MoKSwgZnJvbSB0aGUgcGFzdGVjcyBwYWNrYWdlOioqDQoNCmBgYHtyfQ0KDQppbnN0YWxsLnBhY2thZ2VzKCJwYXN0ZWNzIikNCmxpYnJhcnkocGFzdGVjcykNCnN0YXQuZGVzYyhkYXRhKQ0KDQpgYGANCg0KDQoqKjUtZGVzY3JpYmUgYW5kIGRlc2NyaWJlQnksIGZyb20gdGhlIHBzeWNoIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoInBzeWNoIikNCmxpYnJhcnkocHN5Y2gpDQpwc3ljaDo6ZGVzY3JpYmUoZGF0YSkNCmRlc2NyaWJlKGRhdGEpDQpwc3ljaDo6ZGVzY3JpYmVCeShkYXRhLCBkYXRhJHR5cGUpDQoNCiNUaGUg4oCcbWF04oCdIHBhcmFtZXRlciBkb2VzIGFsbG93IHlvdSB0byBwcm9kdWNlIA0KI2EgbWF0cml4IG91dHB1dCBvZiB0aGUgYWJvdmUuDQpwc3ljaDo6ZGVzY3JpYmVCeShkYXRhLCBkYXRhJHR5cGUsIG1hdCA9IFRSVUUpDQoNCmBgYA0KDQoqKjYtZGVzY3IgYW5kIGRmU3VtbWFyeSwgZnJvbSB0aGUgc3VtbWFyeXRvb2xzIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoInN1bW1hcnl0b29scyIpDQpsaWJyYXJ5KHN1bW1hcnl0b29scykNCnN1bW1hcnl0b29sczo6ZGVzY3IoZGF0YSkNCiNPbmx5IHdvcmtzIHdpdGggbnVtZXJpY2FsIGRhdGEuIA0KZGVzY3IoZGF0YSkNCg0KI2FzIGRhdGEuZnJhbWUNCmthYmxlKGFzLmRhdGEuZnJhbWUoc3VtbWFyeXRvb2xzOjpkZXNjcihkYXRhKSkpDQpzdW1tYXJ5dG9vbHM6OmRlc2NyKGRhdGEpDQoNCiN0cmFuc3Bvc2UNCnN1bW1hcnl0b29sczo6ZGVzY3IoZGF0YSwgdHJhbnNwb3NlID0gVFJVRSkNCg0KI0NvbXBsZXRlIG1lbnUgZnJvbSBzdW1tYXJ5dG9vbHMNCmRmU3VtbWFyeShkYXRhKQ0KDQpgYGANCg0KDQoqKjctQ3JlYXRlVGFibGVPbmUsIGZyb20gdGhlIHRhYmxlb25lIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoInRhYmxlb25lIikNCmxpYnJhcnkodGFibGVvbmUpDQoNCkNyZWF0ZVRhYmxlT25lKGRhdGEgPSBkYXRhKQ0Kc3VtbWFyeShDcmVhdGVUYWJsZU9uZShkYXRhID0gZGF0YSkpDQoNCkNyZWF0ZVRhYmxlT25lKHN0cmF0YSA9ICJj
YXRlZ29yeSIsIGRhdGEgPSBkYXRhKQ0KDQojRm9yIGV4YW1wbGUsIGlmIHdlIHRoaW5rIOKAnHNjb3Jl4oCdIA0KI3Nob3VsZCBub3QgYmUgdHJlYXRlZCBhcyBub3JtYWw6DQpwcmludChDcmVhdGVUYWJsZU9uZShzdHJhdGEgPSAiY2F0ZWdvcnkiLCBkYXRhID0gZGF0YSksIA0Kbm9ubm9ybWFsID0gInNjb3JlIikNCg0KYGBgDQoNCg0KKio4LWRlc2N0YWJsZSwgZnJvbSB0aGUgZGVzY3RhYmxlIHBhY2thZ2U6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoImRlc2N0YWJsZSIpDQpsaWJyYXJ5KGRlc2N0YWJsZSkNCmRlc2N0YWJsZShkYXRhKQ0KDQpncm91cF9ieShkYXRhLCBjYXRlZ29yeSkgJT4lDQpkZXNjdGFibGUoKQ0KIA0KI1RoaXMgZnVuY3Rpb24gaXMgc3VwZXIgY3VzdG9taXNhYmxlLg0KZGVzY3RhYmxlKGRhdGEsc3RhdHMgPSBsaXN0KCJOIiA9IGxlbmd0aCwgIk1lYW4iID0gbWVhbiwgDQoiU0QiID0gc2QsICJNaW4iID0gbWluLCAiTWF4IiA9IG1heCkpDQoNCmBgYA0KDQoNCg0KKio5LWdncGFpcnMsIGZyb20gdGhlIEdHYWxseSBwYWNrYWdlOioqDQoNCmBgYHtyfQ0KDQppbnN0YWxsLnBhY2thZ2VzKCJHR2FsbHkiKQ0KbGlicmFyeShHR2FsbHkpDQoNCmdncGFpcnMoZGF0YSkNCmdncGFpcnMoZGF0YSwgbWFwcGluZyA9IGFlcyhjb2xvdXIgPSBjYXRlZ29yeSkpDQoNCmBgYA0KDQoqKjEwLWRzX3N1bW1hcnlfc3RhdHMgZnJvbSBkZXNjcmlwdHI6KioNCg0KYGBge3J9DQoNCmluc3RhbGwucGFja2FnZXMoImRlc2NyaXB0ciIpDQpsaWJyYXJ5KGRlc2NyaXB0cikNCg0KZHNfc3VtbWFyeV9zdGF0cyhkYXRhJHNjb3JlKQ0KZHNfc2NyZWVuZXIoZGF0YSkNCmRzX211bHRpX3N0YXRzKGZpbHRlcihkYXRhLCAhaXMubmEoc2NvcmUpKSwgc2NvcmUsIHJhdGluZykNCmRzX2ZyZXFfdGFibGUoZGF0YSRjYXRlZ29yeSkNCg0KYGBgDQoNCg0KKioxMS1XaXRoIGRsb29rcjogIEFuIGF1dG9tYXRlZCByZXBvcnQgKGFzIHBkZiBvciBodG1sKToqKg0KDQpgYGB7cn0NCg0KZGF0YTwtcXJ5X25ldWVfREwNClZpZXcoZGlhZ25vc2UoZGF0YSkpDQoNCmE8LWRhdGEgJT4lDQogIGRpYWdub3NlKCkgJT4lDQogIHNlbGVjdCgtdW5pcXVlX2NvdW50LCAtdW5pcXVlX3JhdGUpICU+JSANCiAgZmlsdGVyKG1pc3NpbmdfY291bnQgPiAwKSAlPiUgDQogIGFycmFuZ2UoZGVzYyhtaXNzaW5nX2NvdW50KSkNClZpZXcoYSkNCg0KVmlldyhkaWFnbm9zZV9udW1lcmljKGRhdGEpKQ0KVmlldyhkaWFnbm9zZV9jYXRlZ29yeShkYXRhKSkNCg0KI05vIDEuIE1pc3NpbmcgdmFsdWVzDQpkaWFnbm9zZV9jYXRlZ29yeShkYXRhKSAlPiUgDQogIGZpbHRlcihpcy5uYShsZXZlbHMpKQ0KDQojMC4wMSUgbGlzdCBsZXZlbHMNCmRhdGEgJT4lDQogIGRpYWdub3NlX2NhdGVnb3J5KHRvcCA9IDUwMCkgICU+JQ0KICBmaWx0ZXIocmF0aW8gPD0gMC4wMSkNCg0KDQojRGlhZ25vc2luZyBvdXRsaWVycyB3aXRoIGRpYWdub3NlX291dGxpZXIoKQ0KZGlh
Z25vc2Vfb3V0bGllcihkYXRhKQ0KDQojTnVtZXJpYyB2YXJpYWJsZXMgdGhhdCBjb250YWluIGFub21hbGllcyBhcmUgZWFzaWx5IGZvdW5kIA0KI3dpdGggZmlsdGVyKCkuOg0KZGlhZ25vc2Vfb3V0bGllcihkYXRhKSAlPiUgDQogIGZpbHRlcihvdXRsaWVyc19jbnQgPiAwKSANCg0KI1RoZSBmb2xsb3dpbmcgaXMgYSBsaXN0IG9mIG51bWVyaWMgdmFyaWFibGVzIHdpdGggDQojYW5vbWFsaWVzIGdyZWF0ZXIgdGhhbiA1JS46DQpkaWFnbm9zZV9vdXRsaWVyKGRhdGEpICU+JSANCiAgZmlsdGVyKG91dGxpZXJzX3JhdGlvID4gNSkgJT4lIA0KICBtdXRhdGUocmF0ZSA9IG91dGxpZXJzX21lYW4gLyB3aXRoX21lYW4pICU+JSANCiAgYXJyYW5nZShkZXNjKHJhdGUpKSAlPiUgDQogIHNlbGVjdCgtb3V0bGllcnNfY250KQ0KDQojVmlzdWFsaXphdGlvbiBvZiBvdXRsaWVycyB1c2luZyBwbG90X291dGxpZXIoKQ0KZGF0YSAlPiUNCiAgcGxvdF9vdXRsaWVyKEFsdGVyKSANCg0KI1VzZSB0aGUgZnVuY3Rpb24gb2YgdGhlIGRwbHlyIHBhY2thZ2UgYW5kIHBsb3Rfb3V0bGllcigpIA0KI2FuZCBkaWFnbm9zZV9vdXRsaWVyKCkgdG8gdmlzdWFsaXplIGFub21hbHkgdmFsdWVzIA0KI29mIGFsbCBudW1lcmljIHZhcmlhYmxlcyB3aXRoIGFuIG91dGxpZXIgcmF0aW8gDQojb2YgMC41JSBvciBtb3JlLjoNCg0KZGF0YSAlPiUNCiAgcGxvdF9vdXRsaWVyKGRpYWdub3NlX291dGxpZXIoZGF0YSkgJT4lIA0KICAgICAgICAgICAgICAgICBmaWx0ZXIob3V0bGllcnNfcmF0aW8gPj0gMC41KSAlPiUgDQogICAgICAgICAgICAgICAgIHNlbGVjdCh2YXJpYWJsZXMpICU+JSANCiAgICAgICAgICAgICAgICAgdW5saXN0KCkpDQoNCmRhdGEgJT4lDQogIHBsb3Rfb3V0bGllcihkaWFnbm9zZV9vdXRsaWVyKGRhdGEpICU+JSANCiAgICAgICAgICAgICAgICAgZmlsdGVyKG91dGxpZXJzX3JhdGlvID49IDAuNSkgJT4lIA0KICAgICAgICAgICAgICAgICBzZWxlY3QodmFyaWFibGVzKSAlPiUgDQogICAgICAgICAgICAgICAgIHVubGlzdCgpKQ0KDQpkYXRhICU+JQ0KICBkaWFnbm9zZV9yZXBvcnQob3V0cHV0X2Zvcm1hdCA9ICJodG1sIiwgDQogIG91dHB1dF9maWxlID0gIkRpYWduLmh0bWwiKQ0KDQpgYGANCg0KDQojQ2hhbmdlIGxvZyB1cGRhdGUNCg0KKiAzMC4wOS4yMDE4DQoqIDMwLjAxLjIwMTkNCg0KPEJyPg0KDQojUHJlZmVyZW5jZXMNCg0KKiBodHRwczovL2Jvb2tkb3duLm9yZy9yZHBlbmcvZXhkYXRhL2V4cGxvcmF0b3J5LWRhdGEtYW5hbHlzaXMtY2hlY2tsaXN0Lmh0bWwgDQoqIGh0dHBzOi8vd3d3LnN0YXRpc3Rpay1uYWNoaGlsZmUuZGUvDQoqIGh0dHBzOi8vd3d3LmNyYXNoa3Vycy1zdGF0aXN0aWsuZGUvDQoqIGh0dHBzOi8vZGF0YXNjaWVuY2VwbHVzLmNvbS8NCiogaHR0cHM6Ly90b3dhcmRzZGF0YXNjaWVuY2UuY29tLw0KDQo8QnI+DQoNCiNMaWNlbnNlDQoNCltNSVRdKGh0dHBzOi8vb3BlbnNvdXJjZS5vcmcvbGljZW5zZXMvTUlU
KQ==