diff --git a/app/lib/meadow/seed/export.ex b/app/lib/meadow/seed/export.ex index 127933454..49048cc2b 100644 --- a/app/lib/meadow/seed/export.ex +++ b/app/lib/meadow/seed/export.ex @@ -6,7 +6,6 @@ defmodule Meadow.Seed.Export do alias Ecto.Adapters.SQL alias Meadow.Data.FileSets alias Meadow.Data.Schemas.{FileSet, Work} - alias Meadow.Ingest.Schemas.Sheet alias Meadow.Repo alias Meadow.Seed.{Migration, Queries} @@ -18,7 +17,6 @@ defmodule Meadow.Seed.Export do @ingest_sheet_exports ~w(ingest_sheet_projects ingest_sheets ingest_sheet_rows ingest_sheet_progress ingest_sheet_works ingest_sheet_file_sets ingest_sheet_action_states)a @standalone_exports ~w(standalone_works standalone_file_sets standalone_action_states)a - @ingest_sheet_end_states ["file_fail", "row_fail", "completed"] def export_manifest(bucket, prefix) do manifest = %{last_migration_version: Migration.latest_version()} |> Jason.encode!() @@ -35,18 +33,14 @@ defmodule Meadow.Seed.Export do def export_ingest_sheets(_, _, nil), do: raise(ArgumentError, "Export requires a prefix") - def export_ingest_sheets(limit, bucket, prefix) do - from(s in Sheet, where: s.status in ^@ingest_sheet_end_states) - |> random_ids(limit) - |> export(@ingest_sheet_exports, bucket, prefix) + def export_ingest_sheets(ids, bucket, prefix) do + export(ids, @ingest_sheet_exports, bucket, prefix) end def export_standalone_works(_, _, nil), do: raise(ArgumentError, "Export requires a prefix") - def export_standalone_works(limit, bucket, prefix) do - from(w in Work, where: is_nil(w.ingest_sheet_id)) - |> random_ids(limit) - |> export(@standalone_exports, bucket, prefix) + def export_standalone_works(ids, bucket, prefix) do + export(ids, @standalone_exports, bucket, prefix) end defp export(ids \\ [], file_list, bucket, prefix) do @@ -62,13 +56,6 @@ defmodule Meadow.Seed.Export do ids end - defp random_ids(_, 0), do: [Ecto.UUID.generate()] - - defp random_ids(queryable, limit) do - from(q in queryable, order_by: fragment("RANDOM()"), select: q.id, limit: ^limit) - |> Repo.all() - end - def ingest_sheet_assets(ingest_sheet_ids) do from(w in Work, join: fs in FileSet, diff --git a/app/lib/mix/tasks/seed/export.ex b/app/lib/mix/tasks/seed/export.ex index a39d25f4a..fb1c746db 100644 --- a/app/lib/mix/tasks/seed/export.ex +++ b/app/lib/mix/tasks/seed/export.ex @@ -4,8 +4,8 @@ defmodule Mix.Tasks.Meadow.Seed.Export do ## Command line options - * `--ingest_sheets` - how many ingest sheets (with associated data) to export (default: `0`) - * `--works` - how many non-ingest-sheet works (with associated data) to export (default: `0`) + * `--ingest_sheets` - CSV file with ingest sheet IDs to export in the first column (default: `nil`) + * `--works` - CSV file with standalone work IDs to export in the first column (default: `nil`) * `--bucket` - target S3 bucket (default: the configured Meadow uploads bucket) * `--prefix` - (required) S3 prefix for exported assets * `--skip-assets` - output data only, no preservation or pyramid files (default: `false`) @@ -15,12 +15,13 @@ defmodule Mix.Tasks.Meadow.Seed.Export do use Mix.Task alias Meadow.Seed.Export + alias NimbleCSV.RFC4180, as: CSV require Logger @opts [ - ingest_sheets: :integer, - works: :integer, + ingest_sheets: :string, + works: :string, bucket: :string, prefix: :string, skip_assets: :boolean, @@ -38,14 +39,16 @@ defmodule Mix.Tasks.Meadow.Seed.Export do with {opts, _} <- OptionParser.parse!(args, strict: @opts) do opts |> Enum.into(%{ - ingest_sheets: 0, - works: 0, + ingest_sheets: nil, + works: nil, bucket: System.get_env("SHARED_BUCKET"), prefix: nil, skip_assets: false, threads: 1 }) end + |> Map.update(:ingest_sheets, nil, &ids_from_csv/1) + |> Map.update(:works, nil, &ids_from_csv/1) if missing?(parsed_opts.bucket), do: raise(ArgumentError, "Bucket is required") if missing?(parsed_opts.prefix), do: raise(ArgumentError, "Prefix is required") @@ -56,7 +59,7 @@ defmodule Mix.Tasks.Meadow.Seed.Export do Logger.info("Exporting collections and nul_authorities") Export.export_common(parsed_opts.bucket, parsed_opts.prefix) - Logger.info("Exporting #{parsed_opts.ingest_sheets} ingest sheets") + Logger.info("Exporting #{length(parsed_opts.ingest_sheets)} ingest sheets") sheet_ids = Export.export_ingest_sheets( @@ -70,7 +73,7 @@ defmodule Mix.Tasks.Meadow.Seed.Export do |> Export.export_assets(parsed_opts.bucket, parsed_opts.prefix, parsed_opts.threads) end - Logger.info("Exporting #{parsed_opts.works} works") + Logger.info("Exporting #{length(parsed_opts.works)} works") work_ids = Export.export_standalone_works(parsed_opts.works, parsed_opts.bucket, parsed_opts.prefix) @@ -84,4 +87,13 @@ defmodule Mix.Tasks.Meadow.Seed.Export do end def missing?(value), do: is_nil(value) or value == "" + + defp ids_from_csv(filename) when is_binary(filename) and byte_size(filename) > 0 do + File.stream!(filename, [:trim_bom], :line) + |> CSV.parse_stream(skip_headers: false) + |> Stream.map(fn [id | _] -> id end) + |> Enum.to_list() + end + + defp ids_from_csv(_), do: [] end