Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Atomizer + PDF extractor #591

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
368 changes: 350 additions & 18 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[workspace]
members = [
"atomizer",
"server",
"cli",
"lib",
Expand Down
13 changes: 13 additions & 0 deletions atomizer/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
description = "Turn files into Atomic Data."
edition = "2021"
name = "atomizer"
version = "0.1.0"

[dependencies]
atomic_lib = {version = "0.34.0", path = "../lib"}
kamadak-exif = "0.5.5"
mime_guess = "2.0.4"
# Preferably use the upstream crate, but we're waiting for this PR to be merged:
# https://github.com/jrmuizel/pdf-extract/pull/48
# Note: a Git source is selected with the `git` key; `repository` is not a
# dependency key and would leave this entry without any source.
pdf-extract = {git = "https://github.com/joepio/pdf-extract/"}
58 changes: 58 additions & 0 deletions atomizer/src/file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use std::{collections::HashMap, error::Error, io::Read};

use atomic_lib::resources::PropVals;
use mime_guess::Mime;

/// A file held in memory: its name, a MIME type, and its raw contents.
#[derive(Debug, Clone)]
pub struct File {
    /// Name (or path) the file was created with.
    filename: String,
    /// MIME type of the file; the constructors guess it from `filename`.
    mime: Mime,
    /// Raw contents of the file.
    bytes: Vec<u8>,
}

impl File {
    /// Reads `filename` from disk and guesses its MIME type from the file name.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the file cannot be opened or read.
    pub fn open(filename: &str) -> Result<File, Box<dyn Error>> {
        let mut file = std::fs::File::open(filename)?;
        let mut bytes = Vec::new();
        // Read the contents in bulk instead of collecting one `Result<u8>` per byte.
        file.read_to_end(&mut bytes)?;

        Self::from_filename_bytes(filename, bytes)
    }

    /// Builds a `File` from contents that are already in memory.
    ///
    /// The MIME type is guessed from `filename`, falling back to
    /// `application/octet-stream` when the name gives no hint.
    ///
    /// # Errors
    ///
    /// Currently never fails; the `Result` is part of the existing signature.
    pub fn from_filename_bytes(filename: &str, bytes: Vec<u8>) -> Result<File, Box<dyn Error>> {
        let mime = mime_guess::from_path(filename).first_or_octet_stream();

        Ok(File {
            filename: filename.to_string(),
            mime,
            bytes,
        })
    }

    /// Creates property-value combinations based on the file's contents.
    /// Defaults to an empty HashMap if the file type is not supported.
    pub fn to_propvals(self) -> PropVals {
        // Compare against the typed MIME constants rather than allocating a
        // `String` and matching on hand-written literals.
        if self.mime == mime_guess::mime::APPLICATION_PDF {
            crate::pdf::atomize(self)
        } else if self.mime == mime_guess::mime::IMAGE_JPEG {
            crate::image::atomize(self)
        } else {
            HashMap::new()
        }
    }

    /// Returns a copy of the file's contents.
    ///
    /// NOTE(review): this only reads `self`, so `&self` would suffice; the
    /// `&mut self` receiver is kept so existing callers stay unchanged.
    pub fn bytes(&mut self) -> Vec<u8> {
        self.bytes.clone()
    }

    /// Returns the MIME type of the file.
    pub fn mime(&self) -> &Mime {
        &self.mime
    }

    /// Returns the name the file was created with.
    pub fn filename(&self) -> &str {
        &self.filename
    }
}
62 changes: 62 additions & 0 deletions atomizer/src/image.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
use atomic_lib::resources::PropVals;
use exif::{In, Tag};

const DATE_TIME: &str = "date_time";

/// Returns the property name used for the given EXIF tag.
///
/// Tags without a dedicated name are reported as `"unknown"`.
// These should map to Atomic Data Properties
fn map_tag(tag: Tag) -> String {
    let name = match tag {
        Tag::DateTime => DATE_TIME,
        Tag::ImageDescription => "image_description",
        Tag::XResolution => "x_resolution",
        Tag::PixelXDimension => "pixel_x_dimension",
        _ => "unknown",
    };
    String::from(name)
}

/// Extracts a fixed set of EXIF fields from an image file.
///
/// The fields looked up (with `In::PRIMARY`) are the pixel X dimension, the
/// X resolution, the image description and the date/time. Each field that is
/// present is stored as a string value under the name given by `map_tag`.
///
/// Returns an empty map when the EXIF data cannot be read (for example an
/// image without EXIF metadata) instead of panicking.
pub fn atomize(mut file: crate::file::File) -> PropVals {
    let mut props = PropVals::new();

    println!("Reading EXIF data from {}", file.filename());

    // `Cursor` already implements `BufRead` and `Seek`, so no extra
    // `BufReader` layer is needed around the in-memory bytes.
    let mut reader = std::io::Cursor::new(file.bytes());
    let exif = match exif::Reader::new().read_from_container(&mut reader) {
        Ok(exif) => exif,
        Err(err) => {
            // Missing or malformed EXIF data is an expected condition for
            // uploaded files, so report it and carry on without properties.
            eprintln!(
                "Could not read EXIF data from {}: {:?}",
                file.filename(),
                err
            );
            return props;
        }
    };

    let tag_list = [
        Tag::PixelXDimension,
        Tag::XResolution,
        Tag::ImageDescription,
        Tag::DateTime,
    ];

    for tag in tag_list {
        if let Some(field) = exif.get_field(tag, In::PRIMARY) {
            props.insert(
                map_tag(tag),
                atomic_lib::Value::String(field.display_value().to_string()),
            );
            println!("{}: {}", field.tag, field.display_value().with_unit(&exif));
        }
    }

    props
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::file::File;

    /// Atomizes the bundled test image and checks the extracted date/time.
    #[test]
    fn load_image() {
        let propvals = File::open("./test/image.jpg").unwrap().to_propvals();
        let date = propvals.get(DATE_TIME).unwrap();
        println!("Date: {}", date);
        let rendered = date.to_string();
        assert!(rendered.contains("2008"));
    }
}
3 changes: 3 additions & 0 deletions atomizer/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// The `File` type and `to_propvals`, which dispatches on the file's MIME type.
pub mod file;
/// EXIF extraction, used for `image/jpeg` files.
mod image;
/// Text extraction, used for `application/pdf` files.
mod pdf;
26 changes: 26 additions & 0 deletions atomizer/src/pdf.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
use atomic_lib::resources::PropVals;

const CONTENT_PROP: &str = atomic_lib::urls::DESCRIPTION;

/// Extracts the text from a PDF file.
///
/// On success the text is stored as a Markdown value under `CONTENT_PROP`.
/// If extraction returns an error (for example a malformed PDF), the error
/// is reported and an empty map is returned instead of panicking.
pub fn atomize(mut file: crate::file::File) -> PropVals {
    let mut props = PropVals::new();
    let bytes = file.bytes();
    match pdf_extract::extract_text_from_mem(&bytes) {
        Ok(text) => {
            props.insert(CONTENT_PROP.into(), atomic_lib::Value::Markdown(text));
        }
        // Uploaded files are untrusted input; a broken PDF must not abort the caller.
        Err(err) => eprintln!(
            "Could not extract text from {}: {:?}",
            file.filename(),
            err
        ),
    }
    props
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::file::File;

    /// Atomizes the bundled demo PDF and checks that its text was extracted.
    #[test]
    fn load_pdf() {
        let propvals = File::open("./test/docs-demo.pdf").unwrap().to_propvals();
        let text = propvals.get(CONTENT_PROP).unwrap().to_string();
        assert!(text.contains("Atomic Data"));
    }
}
Binary file added atomizer/test/docs-demo.pdf
Binary file not shown.
Binary file added atomizer/test/image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added atomizer/test/simple.pdf
Binary file not shown.
2 changes: 0 additions & 2 deletions lib/src/client.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
//! Functions for interacting with an Atomic Server
use url::Url;

use crate::{
agents::Agent,
commit::sign_message,
Expand Down
2 changes: 1 addition & 1 deletion lib/src/populate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::{
parse::ParseOpts,
schema::{Class, Property},
storelike::Query,
urls, Storelike, Value,
urls, Resource, Storelike, Value,
};

/// Populates a store with some of the most fundamental Properties and Classes needed to bootstrap the whole.
Expand Down
1 change: 1 addition & 0 deletions server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ actix-cors = "0.6"
actix-files = "0.6"
actix-multipart = "0.4"
actix-web-actors = "4"
atomizer = { version = "0.1.0", path = "../atomizer" }
base64 = "0.13"
chrono = "0.4"
colored = "2"
Expand Down
15 changes: 9 additions & 6 deletions server/src/handlers/upload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ pub async fn upload_handler(

let mut file_path = appstate.config.uploads_path.clone();
file_path.push(&file_id);
let mut file = std::fs::File::create(file_path)?;
let mut file = std::fs::File::create(file_path.clone())?;

// Field in turn is stream of *Bytes* object
while let Some(chunk) = field.next().await {
Expand All @@ -87,17 +87,20 @@ pub async fn upload_handler(
let download_url = format!("{}/download/{}", store.get_server_url(), subject_path);

let mut resource = atomic_lib::Resource::new_instance(urls::FILE, store)?;
let mime = guess_mime_for_filename(filename);
resource.set_subject(new_subject);
resource.set_propval_string(urls::PARENT.into(), &query.parent, store)?;
resource.set_propval_string(urls::INTERNAL_ID.into(), &file_id, store)?;
resource.set_propval(urls::FILESIZE.into(), Value::Integer(byte_count), store)?;
resource.set_propval_string(
urls::MIMETYPE.into(),
&guess_mime_for_filename(filename),
store,
)?;
resource.set_propval_string(urls::MIMETYPE.into(), &mime, store)?;
resource.set_propval_string(urls::FILENAME.into(), filename, store)?;
resource.set_propval_string(urls::DOWNLOAD_URL.into(), &download_url, store)?;

// Extract data from files, turn into JSON-AD
for (prop, val) in atomizer::file::File::open(&file_path.to_string_lossy())?.to_propvals() {
resource.set_propval(prop, val, store)?;
}

commit_responses.push(resource.save(store)?);
created_resources.push(resource);
}
Expand Down