Skip to content

Commit

Permalink
added heuristic to parse numeric strings, closes #2
Browse files Browse the repository at this point in the history
  • Loading branch information
racinmat committed Mar 31, 2020
1 parent 8bb1c37 commit d33c9e3
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 51 deletions.
13 changes: 11 additions & 2 deletions src/schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,29 @@ types(e::Entry) = unique(typeof.(collect(keys(e.counts))))
# Keys observed in the entry, returned as a freshly allocated, sorted vector.
Base.keys(e::Entry) = sort!(collect(keys(e.counts)))
# An `Entry` is never reported as empty, whatever its contents.
Base.isempty(::Entry) = false

"""
    unify_types(e::Entry)

Return the result of `promote_type` applied to the distinct types of the keys stored in
`e.counts`.
"""
function unify_types(e::Entry)
    keytypes = unique(map(typeof, collect(keys(e.counts))))
    return promote_type(keytypes...)
end

"""
    suggestextractor(e::Entry, settings = NamedTuple())

Pick an extractor for the schema entry `e`.

The candidate list is read from `settings[:scalar_extractors]`, falling back to
`default_scalar_extractor()` when that key is absent. Each candidate is a
`(condition, constructor)` pair; the constructor of the first pair whose condition
accepts `e` is applied to `e` and its result is returned. Returns `nothing` when no
condition matches.
"""
function suggestextractor(e::Entry, settings = NamedTuple())
    # Compute the common key type once, through the shared helper.
    t = unify_types(e)
    # NOTE(review): this only logs an error; despite the wording of the message,
    # execution continues with the extractor search below.
    t == Any && @error "JSON does not have a fixed type scheme, quitting"

    for (c, ex) in get(settings, :scalar_extractors, default_scalar_extractor())
        c(e) && return ex(e)
    end
    return nothing
end

"""
    isfloat(s::AbstractString)

Return `true` when `tryparse(Float64, s)` succeeds, `false` otherwise.
"""
isfloat(s::AbstractString) = tryparse(Float64, s) !== nothing
"""
    isint(s::AbstractString)

Return `true` when `tryparse(Int64, s)` succeeds, `false` otherwise.
"""
isint(s::AbstractString) = tryparse(Int64, s) !== nothing

"""
    default_scalar_extractor()

Return the default list of `(condition, constructor)` pairs consulted by
`suggestextractor`. Both elements of a pair are functions of a schema entry `e`.
The pairs are meant to be tried in order:

1. the number of distinct keys divided by `e.updated` is below `0.1` and there are at
   most `10000` distinct keys: `ExtractCategorical` over the collected keys;
2. the unified key type is a string type and every key parses as `Int64`:
   `extractscalar(Int64)`;
3. the unified key type is a string type and every key parses as `Float64`:
   `extractscalar(Float64)`;
4. fallback, always accepted: `extractscalar` of the unified key type.
"""
function default_scalar_extractor()
    return [
        (e -> length(keys(e.counts)) / e.updated < 0.1 && length(keys(e.counts)) <= 10000,
         e -> ExtractCategorical(collect(keys(e.counts)))),
        # Dictionary keys are already distinct, so no `unique` is needed; `all(f, itr)`
        # stops at the first key that fails to parse.
        (e -> unify_types(e) <: AbstractString && all(isint, keys(e.counts)),
         e -> extractscalar(Int64)),
        # The Int64 rule precedes this one, so all-integer strings never reach it.
        (e -> unify_types(e) <: AbstractString && all(isfloat, keys(e.counts)),
         e -> extractscalar(Float64)),
        (e -> true,
         e -> extractscalar(unify_types(e))),
    ]
end


Expand Down
66 changes: 17 additions & 49 deletions test/extractors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -199,52 +199,20 @@ end
@test ext[:e] isa ExtractBranch
@test isnothing(ext[:f])
end
#
# @testset "Extractor number " begin
# j1 = JSON.parse("""{"a": "1", "b": "a", "c": "1.1", "d": 1.1, "e": "1.2"}""")
# j2 = JSON.parse("""{"a": "2", "b": "b", "c": "2", "d": 2, "e": "1.3"}""")
# j3 = JSON.parse("""{"a": "3", "b": "c", "c": "2.3", "d": 2.3, "e": "1.4"}""")
# j4 = JSON.parse("""{"a": "4", "b": "c", "c": "5", "d": 5, "e": "1.4"}""")
#
# sch = JsonGrinder.schema([j1, j2, j3, j4])
# ext = suggestextractor(sch)
#
# isfloat(s::AbstractString) = tryparse(Float64, s) isa Number
# isint(s::AbstractString) = tryparse(Int64, s) isa Number
# isnumeric(s::AbstractString) = tryparse(Float64, s) isa Number
# isnumeric("5")
# isnumeric("5.5")
# isnumeric
# e = sch[:a]
#
# all(isint.(unique(keys(e.counts))))
# all(isfloat.(unique(keys(e.counts))))
#
# JsonGrinder.extractscalar(Int64)
#
# all(isfloat.(unique(keys(sch[:b].counts))))
# all(isint.(unique(keys(sch[:b].counts))))
#
# all(isfloat.(unique(keys(sch[:c].counts))))
# all(isint.(unique(keys(sch[:c].counts))))
#
# t = promote_type(unique(typeof.(keys(e.counts)))...)
# @test ext[:a] isa ExtractArray
# @test isnothing(ext[:b])
# @test isnothing(ext[:c])
# @test ext[:d] isa ExtractScalar
# @test ext[:e] isa ExtractBranch
# @test isnothing(ext[:f])
# end
#
#
# function default_scalar_extractor()
# [(e -> (length(keys(e.counts)) / e.updated < 0.1 && length(keys(e.counts)) <= 10000),
# e -> ExtractCategorical(collect(keys(e.counts)))),
# (e -> all(isint.(unique(keys(e.counts))))
# e -> extractscalar()),
# (e -> all(isfloat.(unique(keys(e.counts))))
# e -> extractscalar()),
# (e -> true,
# e -> extractscalar(promote_type(unique(typeof.(keys(e.counts)))...))),]
# end

@testset "Extractor of numbers as strings" begin
    # Four documents with identical keys; every key covers one scalar scenario.
    documents = map(JSON.parse, [
        """{"a": "1", "b": "a", "c": "1.1", "d": 1.1, "e": "1.2", "f": 1}""",
        """{"a": "2", "b": "b", "c": "2", "d": 2, "e": "1.3", "f": 2}""",
        """{"a": "3", "b": "c", "c": "2.3", "d": 2.3, "e": "1.4", "f": 3}""",
        """{"a": "4", "b": "c", "c": "5", "d": 5, "e": "1.4", "f": 3}""",
    ])

    ext = suggestextractor(JsonGrinder.schema(documents))

    # Expected extractor datatype per key:
    #   a: integer-looking strings      b: non-numeric strings
    #   c: mixed int/float strings      d: JSON numbers, mixed int/float
    #   e: float-looking strings        f: JSON integers
    expected = (a = Int64, b = String, c = Float64, d = Float64, e = Float64, f = Int64)
    for (field, T) in pairs(expected)
        @test ext[field].datatype <: T
    end
end

0 comments on commit d33c9e3

Please sign in to comment.