Skip to content

Commit

Permalink
Merge pull request #3774 from nulib/4259-better-data-export
Browse files Browse the repository at this point in the history
Use CSV manifests instead of random IDs for export
  • Loading branch information
mbklein authored Jan 18, 2024
2 parents fa956ad + f4f55f1 commit 85ba946
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 25 deletions.
21 changes: 4 additions & 17 deletions app/lib/meadow/seed/export.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ defmodule Meadow.Seed.Export do
alias Ecto.Adapters.SQL
alias Meadow.Data.FileSets
alias Meadow.Data.Schemas.{FileSet, Work}
alias Meadow.Ingest.Schemas.Sheet
alias Meadow.Repo
alias Meadow.Seed.{Migration, Queries}

Expand All @@ -18,7 +17,6 @@ defmodule Meadow.Seed.Export do
@ingest_sheet_exports ~w(ingest_sheet_projects ingest_sheets ingest_sheet_rows ingest_sheet_progress
ingest_sheet_works ingest_sheet_file_sets ingest_sheet_action_states)a
@standalone_exports ~w(standalone_works standalone_file_sets standalone_action_states)a
@ingest_sheet_end_states ["file_fail", "row_fail", "completed"]

def export_manifest(bucket, prefix) do
manifest = %{last_migration_version: Migration.latest_version()} |> Jason.encode!()
Expand All @@ -35,18 +33,14 @@ defmodule Meadow.Seed.Export do

def export_ingest_sheets(_, _, nil), do: raise(ArgumentError, "Export requires a prefix")

def export_ingest_sheets(limit, bucket, prefix) do
from(s in Sheet, where: s.status in ^@ingest_sheet_end_states)
|> random_ids(limit)
|> export(@ingest_sheet_exports, bucket, prefix)
# Exports every table listed in @ingest_sheet_exports for the given ingest
# sheet ids by delegating to export/4 with the same bucket and prefix.
def export_ingest_sheets(ids, bucket, prefix),
  do: export(ids, @ingest_sheet_exports, bucket, prefix)

def export_standalone_works(_, _, nil), do: raise(ArgumentError, "Export requires a prefix")

def export_standalone_works(limit, bucket, prefix) do
from(w in Work, where: is_nil(w.ingest_sheet_id))
|> random_ids(limit)
|> export(@standalone_exports, bucket, prefix)
# Exports every table listed in @standalone_exports for the given standalone
# work ids by delegating to export/4 with the same bucket and prefix.
def export_standalone_works(ids, bucket, prefix),
  do: export(ids, @standalone_exports, bucket, prefix)

defp export(ids \\ [], file_list, bucket, prefix) do
Expand All @@ -62,13 +56,6 @@ defmodule Meadow.Seed.Export do
ids
end

# Selects up to `limit` ids from `source`, ordered by the database's RANDOM().
#
# A limit of 0 never touches the database: it returns a one-element list
# holding a freshly generated UUID.
# NOTE(review): presumably so the id list is non-empty yet matches no rows --
# confirm against export/4.
defp random_ids(_source, 0), do: [Ecto.UUID.generate()]

defp random_ids(source, limit) do
  query =
    from(row in source,
      order_by: fragment("RANDOM()"),
      select: row.id,
      limit: ^limit
    )

  Repo.all(query)
end

def ingest_sheet_assets(ingest_sheet_ids) do
from(w in Work,
join: fs in FileSet,
Expand Down
28 changes: 20 additions & 8 deletions app/lib/mix/tasks/seed/export.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ defmodule Mix.Tasks.Meadow.Seed.Export do
## Command line options
* `--ingest_sheets` - how many ingest sheets (with associated data) to export (default: `0`)
* `--works` - how many non-ingest-sheet works (with associated data) to export (default: `0`)
* `--ingest_sheets` - path to a CSV file whose first column lists the IDs of the ingest sheets to export (default: `nil`)
* `--works` - path to a CSV file whose first column lists the IDs of the standalone works to export (default: `nil`)
* `--bucket` - target S3 bucket (default: the configured Meadow uploads bucket)
* `--prefix` - (required) S3 prefix for exported assets
* `--skip-assets` - output data only, no preservation or pyramid files (default: `false`)
Expand All @@ -15,12 +15,13 @@ defmodule Mix.Tasks.Meadow.Seed.Export do
use Mix.Task

alias Meadow.Seed.Export
alias NimbleCSV.RFC4180, as: CSV

require Logger

@opts [
ingest_sheets: :integer,
works: :integer,
ingest_sheets: :string,
works: :string,
bucket: :string,
prefix: :string,
skip_assets: :boolean,
Expand All @@ -38,14 +39,16 @@ defmodule Mix.Tasks.Meadow.Seed.Export do
with {opts, _} <- OptionParser.parse!(args, strict: @opts) do
opts
|> Enum.into(%{
ingest_sheets: 0,
works: 0,
ingest_sheets: nil,
works: nil,
bucket: System.get_env("SHARED_BUCKET"),
prefix: nil,
skip_assets: false,
threads: 1
})
end
|> Map.update(:ingest_sheets, nil, &ids_from_csv/1)
|> Map.update(:works, nil, &ids_from_csv/1)

if missing?(parsed_opts.bucket), do: raise(ArgumentError, "Bucket is required")
if missing?(parsed_opts.prefix), do: raise(ArgumentError, "Prefix is required")
Expand All @@ -56,7 +59,7 @@ defmodule Mix.Tasks.Meadow.Seed.Export do
Logger.info("Exporting collections and nul_authorities")
Export.export_common(parsed_opts.bucket, parsed_opts.prefix)

Logger.info("Exporting #{parsed_opts.ingest_sheets} ingest sheets")
Logger.info("Exporting #{length(parsed_opts.ingest_sheets)} ingest sheets")

sheet_ids =
Export.export_ingest_sheets(
Expand All @@ -70,7 +73,7 @@ defmodule Mix.Tasks.Meadow.Seed.Export do
|> Export.export_assets(parsed_opts.bucket, parsed_opts.prefix, parsed_opts.threads)
end

Logger.info("Exporting #{parsed_opts.works} works")
Logger.info("Exporting #{length(parsed_opts.works)} works")

work_ids =
Export.export_standalone_works(parsed_opts.works, parsed_opts.bucket, parsed_opts.prefix)
Expand All @@ -84,4 +87,13 @@ defmodule Mix.Tasks.Meadow.Seed.Export do
end

@doc """
Returns `true` when `value` is `nil` or the empty string, `false` for anything else.
"""
def missing?(nil), do: true
def missing?(""), do: true
def missing?(_value), do: false

# Reads the CSV file at `filename` and returns the first column of every row
# as a list of ID strings.
#
# * The file is streamed line by line with a leading byte-order mark removed.
# * `skip_headers: false` means the first line is treated as data, not as a
#   header row.
# * Each ID is trimmed and blank IDs are dropped, so empty lines (such as a
#   trailing newline) or padded cells do not turn into bogus IDs.
#
# Any argument that is not a non-empty binary (e.g. `nil` when the option was
# not given) yields an empty list.
defp ids_from_csv(filename) when is_binary(filename) and byte_size(filename) > 0 do
  filename
  |> File.stream!([:trim_bom], :line)
  |> CSV.parse_stream(skip_headers: false)
  |> Stream.map(fn [id | _rest] -> String.trim(id) end)
  |> Enum.reject(&(&1 == ""))
end

defp ids_from_csv(_), do: []
end

0 comments on commit 85ba946

Please sign in to comment.