fixes @summary (#124)

* fixes `@summary` * Added support for non-numeric columns, minor tweaks to column names. --------- Co-authored-by: Karandeep Singh <karandeep@gmail.com>
TidierOrg · Dec 28, 2024 · afdc188 · afdc188
1 parent 70b35d4
commit afdc188
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 14 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,8 @@
 # TidierData.jl updates
 
+## v0.16.3
+- Bugfix: `@summary` no longer errors on non-numeric columns. For non-numeric columns, the numeric summary stats (min, q1, median, mean, q3, max) are left empty and only the non-missing, missing, total, and unique value counts are reported. Summary column names are now lowercase and snake_case.
+
 ## v0.16.2 - 2024-09-03
 - Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
 - Adds `@head`

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.16.2"
+version = "0.16.3"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -2415,7 +2415,8 @@ For numerical columns, returns a dataframe with the Q1,Q3, min, max, mean, media
 julia> df = DataFrame(a = [1, 2, 3, 4, 5],
                       b = [missing, 7, 8, 9, 10],
                       c = [11, missing, 13, 14, missing],
-                      d = [16, 17, 18, 19, 20]);
+                      d = [16.1, 17.2, 18.3, 19.4, 20.5],
+                      e = ["a", "a", "a", "a", "a"]);
 
 julia> @summary(df);
 

diff --git a/src/summary.jl b/src/summary.jl
@@ -3,18 +3,37 @@ function summary_stats(df::DataFrame)
     summary_data = []
     for column in colnames
         col = df[:, column]
-        col_nonmissing = collect(skipmissing(col))
-        push!(summary_data, (
-            Column = column,
-            Min = minimum(col_nonmissing),
-            Q1 = quantile(col_nonmissing, 0.25),
-            Median = median(col_nonmissing),
-            Mean = mean(col_nonmissing),
-            Q3 = quantile(col_nonmissing, 0.75),
-            Max = maximum(col_nonmissing),
-            Count = length(col_nonmissing),
-            Missing_Count = count(ismissing, col)
-        ))
+        if eltype(col) <: Union{Number, Missing}
+            col_nonmissing = collect(skipmissing(col))
+            push!(summary_data, (
+                column = column,
+                min = minimum(col_nonmissing),
+                q1 = quantile(col_nonmissing, 0.25),
+                median = median(col_nonmissing),
+                mean = mean(col_nonmissing),
+                q3 = quantile(col_nonmissing, 0.75),
+                max = maximum(col_nonmissing),
+                non_missing_values = length(col_nonmissing),
+                missing_values = count(ismissing, col),
+                total_values = length(col),
+                unique_values = length(unique(col_nonmissing))
+            ))
+        else
+            col_nonmissing = collect(skipmissing(col))
+            push!(summary_data, (
+                column = column,
+                min = nothing,
+                q1 = nothing,
+                median = nothing,
+                mean = nothing,
+                q3 = nothing,
+                max = nothing,
+                non_missing_values = length(col_nonmissing),
+                missing_values = count(ismissing, col),
+                total_values = length(col),
+                unique_values = length(unique(col_nonmissing))
+            ))
+        end
     end
     return DataFrame(summary_data)
 end