
Commit

first
ahmadtourei committed Jul 3, 2024
1 parent 217f332 commit daacd35
Showing 12 changed files with 935 additions and 1 deletion.
11 changes: 10 additions & 1 deletion README.md
@@ -1,2 +1,11 @@
# das-anomaly
A Python package for an autoencoder-based algorithm to detect anomalies in distributed acoustic sensing (DAS) datasets.
Python scripts for an autoencoder-based algorithm to detect anomalies in distributed acoustic sensing (DAS) datasets.

The main steps are as follows:
1. Plot power spectral density (PSD) images in RGB format. We average the energy over a desired time window and stack all channels together to create a PSD image with channels on the X-axis and frequency on the Y-axis. We create PSDs of normal data (images without any anomaly or seismic event) as well as of known seismic events. We can use MPI to distribute PSD plotting across CPUs (see the minimal sketch after this list).
2. Train the model on normal PSD images.
3. Use the trained model to detect anomalies in PSDs.
4. Count the number of detected anomalies (if needed).
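
A minimal sketch of step 1, assuming a 2-D NumPy strain-rate block with time samples on the rows and channels on the columns (in the scripts, the actual plotting is done by the plot_spec function imported from utils; the helper below is illustrative only):

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import welch

def psd_image(das_block, sampling_rate, out_png, max_freq=None):
    """Save an RGB PSD image: channels on the X-axis, frequency on the Y-axis."""
    # Welch PSD per channel -> psd has shape (n_freqs, n_channels)
    freqs, psd = welch(das_block, fs=sampling_rate, axis=0, nperseg=256)
    if max_freq is not None:
        keep = freqs <= max_freq
        freqs, psd = freqs[keep], psd[keep]
    # Log-scaled power, rendered without axes so the image contains only data
    plt.imshow(10 * np.log10(psd + 1e-20), origin="lower", aspect="auto", cmap="jet")
    plt.axis("off")
    plt.savefig(out_png, bbox_inches="tight", pad_inches=0)
    plt.close()
```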

Contact: Ahmad Tourei, Colorado School of Mines
tourei@mines.edu | ahmadtourei@gmail.com
Binary file added source/__pycache__/utils.cpython-311.pyc
26 changes: 26 additions & 0 deletions source/count_anomalies/count_anomalies.py
@@ -0,0 +1,26 @@
"""
Count the number of anomalies from the saved result text files.
"""
import os
import sys

current_dir = os.path.dirname(os.path.abspath(__file__))
source_dir = os.path.join(current_dir, '..')
sys.path.append(source_dir)
from utils import search_keyword_in_files


root = '/globalscratch/ahmad9/caserm/spectrum_analysis/results/'
result_folder = 'UTC-YMD20220609-HMS124917.291'

keyword = 'anomaly'

directory = os.path.join(root, result_folder)

total_count, lines = search_keyword_in_files(directory, keyword)

with open(root + 'anomalies_' + result_folder + '.txt', 'w') as file:
for line in lines:
file.write(f"{line}\n")

print(f"Total occurrences of the '{keyword}':", total_count)
14 changes: 14 additions & 0 deletions source/count_anomalies/count_anomalies.sh
@@ -0,0 +1,14 @@
#!/bin/bash
#SBATCH -t 00:10:00
#SBATCH -A casermminefiber
#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_count_anomaly_%j.out
#SBATCH --mem-per-cpu=16G

# Print start time
echo "Job started at: $(date)"

source activate casermml
python count_anomalies.py

# Print end time
echo "Job ended at: $(date)"
113 changes: 113 additions & 0 deletions source/detect_anomalies/detect_anomalies.py
@@ -0,0 +1,113 @@
"""
Use the trained model to detect anomalies (potential seismic events).
"""
import glob
import json
import numpy as np
import os
import sys

from tensorflow.keras.models import load_model
from sklearn.neighbors import KernelDensity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator

current_dir = os.path.dirname(os.path.abspath(__file__))
source_dir = os.path.join(current_dir, '..')
sys.path.append(source_dir)
from utils import density, check_if_anomaly


# Size of the input images
size = 128

# Define the path to the results
results_path = '/globalscratch/ahmad9/caserm/spectrum_analysis/results/'
model_path = results_path + f'model_1_{size}.h5'
loaded_model = load_model(model_path)

# Define the path to power spectral density (PSD) plots
data_spool = 'UTC-YMD20220617-HMS155316.989'
root_dir = f'/globalscratch/ahmad9/caserm/spectrum_analysis/spectrum_plots/{data_spool}/'

# Define your desired threshold for density score
density_threshold = 15934.15

# Define the image data generator (same rescaling as used during training).
batch_size = 64
datagen = ImageDataGenerator(rescale=1./255)

# Create the train generator (with the same parameters as for the trained model).
# Note: the directory argument is left empty in this commit and should point to
# the same training PSD folder used in train_ae.py.
train_generator = datagen.flow_from_directory(
    "",
    target_size=(size, size),
    batch_size=batch_size,
    class_mode='input'
)

# Define the autoencoder
# Encoder
model = Sequential()
model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3)))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))

# Decoder
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))

model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Read the history of the trained model
with open(os.path.join(results_path, 'history_1_128.json'), 'r') as json_file:
history_dict = json.load(json_file)

# Extract the encoder network, with trained weights
encoder_model = Sequential()
# Add the convolutional layer without weights
encoder_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3)))
# Set the weights from the corresponding layer of the loaded model
encoder_model.layers[-1].set_weights(loaded_model.layers[0].get_weights())
encoder_model.add(MaxPooling2D((2, 2), padding='same'))
encoder_model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
encoder_model.layers[-1].set_weights(loaded_model.layers[2].get_weights())
encoder_model.add(MaxPooling2D((2, 2), padding='same'))
encoder_model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
encoder_model.layers[-1].set_weights(loaded_model.layers[4].get_weights())
encoder_model.add(MaxPooling2D((2, 2), padding='same'))

# Calculate KDE of latent space using sklearn and determine if PSD is an anomaly
encoded_images = encoder_model.predict(train_generator)
encoder_output_shape = encoder_model.output_shape
out_vector_shape = encoder_output_shape[1]*encoder_output_shape[2]*encoder_output_shape[3]

encoded_images_vector = [np.reshape(img, (out_vector_shape)) for img in encoded_images]

# Fit KDE to the image latent data
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(encoded_images_vector)

# Write whether the image is an anomaly in a text file
for folder_name in os.listdir(root_dir):
folder_path = os.path.join(root_dir, folder_name)
if os.path.isdir(folder_path):
# Construct the glob pattern for PSD file paths in the current folder
spectrum_file_pattern = os.path.join(folder_path, '*')
spectrum_file_paths = glob.glob(spectrum_file_pattern)
        # Save the results in a text file
        with open(results_path + data_spool + '/' + str(folder_name) + '_output_model_1_128_anomaly.txt', 'w') as file:
            for i in range(len(spectrum_file_paths)):
                result = check_if_anomaly(
                    encoder_model=encoder_model,
                    size=size,
                    img_path=spectrum_file_paths[i],
                    density_threshold=density_threshold,
                    kde=kde,
                )
                print(f"Line {i}, image {spectrum_file_paths[i]}: {result}", file=file)
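
The density and check_if_anomaly helpers also come from the compiled utils module. A hedged sketch of the intended logic, assuming the check encodes one PSD image, scores its latent vector with the fitted KDE, and labels it an anomaly when the log-density falls below the threshold (the function names match the imports above; everything else is an assumption):

import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def density(encoder_model, img_array, kde):
    """KDE log-density score of a single image's latent vector."""
    latent = encoder_model.predict(img_array[np.newaxis, ...])
    return kde.score_samples(latent.reshape(1, -1))[0]

def check_if_anomaly(encoder_model, size, img_path, density_threshold, kde):
    """Label a PSD image as 'anomaly' or 'normal' by its latent-space density."""
    img = load_img(img_path, target_size=(size, size))
    img_array = img_to_array(img) / 255.0  # same rescaling as the generators
    score = density(encoder_model, img_array, kde)
    return "anomaly" if score < density_threshold else "normal"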
14 changes: 14 additions & 0 deletions source/detect_anomalies/detect_anomalies.sh
@@ -0,0 +1,14 @@
#!/bin/bash
#SBATCH -t 36:00:00
#SBATCH -A casermminefiber
#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_event_detection_%j.out
#SBATCH --mem-per-cpu=128G

# Print start time
echo "Job started at: $(date)"

source activate casermml
python detect_anomalies.py

# Print end time
echo "Job ended at: $(date)"
89 changes: 89 additions & 0 deletions source/plot_psd/psd_MPI.py
@@ -0,0 +1,89 @@
"""
Plot power spectral density (PSD) plots using MPI.
"""
import numpy as np
import os
import sys
import time

import dascore as dc
from mpi4py import MPI

current_dir = os.path.dirname(os.path.abspath(__file__))
source_dir = os.path.join(current_dir, '..')
sys.path.append(source_dir)
from utils import plot_spec


if not sys.warnoptions:
import warnings
warnings.simplefilter("ignore")
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # disable HDF5 file locking

# Initial settings
t0 = time.time()
data_path = '/projects/casermminefiber/UTC-YMD20220617-HMS155316.989/'
fig_path = '/globalscratch/ahmad9/caserm/spectrum_analysis/spectrum_plots/UTC-YMD20220617-HMS155316.989/'

# Initiate MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if rank==0:
sp = dc.spool(data_path).update()
num_patch = len(sp)

print("Number of patches in the spool: ", num_patch)

splits = num_patch
if splits<1:
raise ValueError('No patch of DAS data found within data path: %s'%(data_path))

patch = sp[0]
time_step = patch.coords.step("time")
sampling_rate = int(1/(time_step / np.timedelta64(1, 's')))
num_sec = patch.seconds
distance_step = round(patch.coords.step("distance"),5)
else:
splits = sp = sampling_rate = num_sec = distance_step = None

# Broadcast the variables
splits = comm.bcast(splits, root=0)
sp = comm.bcast(sp, root=0)
sampling_rate = comm.bcast(sampling_rate, root=0)
num_sec = comm.bcast(num_sec, root=0)
distance_step = comm.bcast(distance_step, root=0)

# Set parameters for preprocessing the data
gauge_length = distance_step*2
start_channel = 100
end_channel = 375
min_freq = 0
max_freq = 0.9*0.5*sampling_rate
time_window = 2 # sec
time_overlap = 1 # sec

# Loop over files (MPI)
for i in range(rank,splits,size):
print(f"wroking on patch number: {i}")
patch = sp[i].transpose("time", "distance")
das_data = patch.data
# Velocity to strain rate
gauge_samples = int(round(gauge_length / distance_step))
strain_rate = das_data[:, gauge_samples:] - das_data[:, :-gauge_samples]
strain_rate = strain_rate / gauge_length

num_digits = len(str(int(num_sec)))

# Loop over time windows and plot the PSDs
for j in range(0,int(num_sec),time_overlap):
start_time = j
end_time = start_time + time_window
if end_time>num_sec:
continue
title = os.path.splitext(str(sp.get_contents()["path"][i]))[0]+'_'+str(start_time).zfill(num_digits)+'-'+str(end_time).zfill(num_digits)+'sec'
output_rank = str(rank).zfill(int(len(str(int(size)))))
plot_spec(strain_rate, start_time, end_time, start_channel, end_channel, min_freq, max_freq, sampling_rate, title, output_rank, fig_path)

sys.exit()
16 changes: 16 additions & 0 deletions source/plot_psd/psd_MPI.sh
@@ -0,0 +1,16 @@
#!/bin/bash
#SBATCH --ntasks=250
#SBATCH -t 12:00:00
#SBATCH -A casermminefiber
#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_MPI_%j.out
#SBATCH --mem-per-cpu=16G

# Print start time
echo "Job started at: $(date)"

module load openmpi/gcc/64/4.0.4
source activate casermmpi
mpirun -n ${SLURM_NTASKS} python psd_MPI.py

# Print end time
echo "Job ended at: $(date)"
105 changes: 105 additions & 0 deletions source/train_model/train_ae.py
@@ -0,0 +1,105 @@
"""
This script uses unsupervised autoencoders to train a deep learning model for anomaly detection in images.
For microseismic event detection, images can be the power spectral density (PSD) plots.
We will consider the bottleneck layer output from our autoencoder as the latent space.
Using the reconstruction error and kernel density estimation (KDE) based on the vectors in the latent space, anomalies can be detected.
"""
import json
import os
import sys

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator

current_dir = os.path.dirname(os.path.abspath(__file__))
source_dir = os.path.join(current_dir, '..')
sys.path.append(source_dir)
from utils import plot_train_test_loss


# Specify path to save results (model and plots)
results_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/results/'

# Size of the input images and number of epochs for training
size = 128
num_epoch = 1000

# Define generators for training, validation, and anomaly data.
batch_size = 64
datagen = ImageDataGenerator(rescale=1./255)

# path to training PSD plots (seen data)
train_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/plots/train/'
num_train_data = 768
train_generator = datagen.flow_from_directory(
train_path,
target_size=(size, size),
batch_size=batch_size,
class_mode='input'
)

# path to testing PSD plots (unseen data)
test_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/plots/test/'
num_test_data = 192
validation_generator = datagen.flow_from_directory(
test_path,
target_size=(size, size),
batch_size=batch_size,
class_mode='input'
)

# path to known seismic events
events_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/seismic_events/plots/obvious_seismic_events/'
anomaly_generator = datagen.flow_from_directory(
events_path,
target_size=(size, size),
batch_size=batch_size,
class_mode='input'
)

# Define the autoencoder.
# Encoder
model = Sequential()
model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3)))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))

# Decoder
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))

model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
model.summary()

# Fit the model.
history = model.fit(
    train_generator,
    steps_per_epoch=num_train_data // batch_size,
    epochs=num_epoch,
    validation_data=validation_generator,
    validation_steps=num_test_data // batch_size,
    shuffle=True)

# Save the model in h5 format
model.save(results_path + 'model_1_128.h5')

# Save the history as well
history_dict = history.history
history_json = json.dumps(history_dict)
with open(results_path + 'history_1_128.json', 'w') as json_file:
json_file.write(history_json)

# Plot the training and validation accuracy and loss at each epoch
plot_train_test_loss(history, results_path)
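
plot_train_test_loss likewise lives in the compiled utils module. A minimal sketch, assuming it saves a loss-versus-epoch curve for the training and validation sets to results_path (the file name and styling are assumptions):

import os
import matplotlib.pyplot as plt

def plot_train_test_loss(history, results_path):
    """Plot training and validation loss per epoch and save the figure."""
    loss = history.history["loss"]
    val_loss = history.history.get("val_loss", [])
    epochs = range(1, len(loss) + 1)
    plt.figure()
    plt.plot(epochs, loss, label="training loss")
    if val_loss:
        plt.plot(epochs, val_loss, label="validation loss")
    plt.xlabel("Epoch")
    plt.ylabel("Mean squared error")
    plt.legend()
    plt.savefig(os.path.join(results_path, "train_test_loss.png"), dpi=200)
    plt.close()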