-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
ahmadtourei
committed
Jul 3, 2024
1 parent
217f332
commit daacd35
Showing
12 changed files
with
935 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,11 @@ | ||
# das-anomaly | ||
A Python package for an autoencoder-based algorithm to detect anomalies in distributed acoustic sensing (DAS) datasets. | ||
Python scripts for an autoencoder-based algorithm to detect anomalies in distributed acoustic sensing (DAS) datasets. | ||
|
||
The main steps are as follows: | ||
1. Plot power spectral density (PSD) plots in RGB format. We average the energy over a desired time window and stack all channels together to create a PSD with channels on the X-axis and frequency on the Y-axis. We create PSD of normal images (images without any anomaly or seismic event) and known seismic events. We can use MPI to distribute plotting PSDs over CPUs. | ||
2. Train the model on normal PSD images. | ||
3. Use the trained model to detect anomalies in PSDs. | ||
4. Count number of detected anomalies (if needed). | ||
|
||
Contact: Ahmad Tourei, Colorado School of Mines | ||
tourei@mines.edu | ahmadtourei@gmail.com |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
""" | ||
Count number of anomalies from the result (saved) text files. | ||
""" | ||
import os | ||
import sys | ||
|
||
current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
source_dir = os.path.join(current_dir, '..') | ||
sys.path.append(source_dir) | ||
from utils import search_keyword_in_files | ||
|
||
|
||
root = '/globalscratch/ahmad9/caserm/spectrum_analysis/results/' | ||
result_folder = 'UTC-YMD20220609-HMS124917.291' | ||
|
||
keyword = 'anomaly' | ||
|
||
directory = os.path.join(root, result_folder) | ||
|
||
total_count, lines = search_keyword_in_files(directory, keyword) | ||
|
||
with open(root + 'anomalies_' + result_folder + '.txt', 'w') as file: | ||
for line in lines: | ||
file.write(f"{line}\n") | ||
|
||
print(f"Total occurrences of the '{keyword}':", total_count) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash | ||
#SBATCH -t 00:10:00 | ||
#SBATCH -A casermminefiber | ||
#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_count_anomaly_%j.out | ||
#SBATCH --mem-per-cpu=16G | ||
|
||
# Print start time | ||
echo "Job started at: $(date)" | ||
|
||
source activate casermml | ||
python count_anomalies.py | ||
|
||
# Print end time | ||
echo "Job ended at: $(date)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
""" | ||
Use the train model to detect anomalies (potential seismic events.) | ||
""" | ||
import glob | ||
import json | ||
import numpy as np | ||
import os | ||
import sys | ||
|
||
from keras.models import load_model | ||
from sklearn.neighbors import KernelDensity | ||
from tensorflow.keras.models import Sequential | ||
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D | ||
from tensorflow.keras.preprocessing.image import ImageDataGenerator | ||
|
||
current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
source_dir = os.path.join(current_dir, '..') | ||
sys.path.append(source_dir) | ||
from utils import density, check_if_anomaly | ||
|
||
|
||
# Size of the input images | ||
size = 128 | ||
|
||
# Define the path to the results | ||
results_path = '/globalscratch/ahmad9/caserm/spectrum_analysis/results/' | ||
model_path = results_path + f'model_1_{size}.h5' | ||
loaded_model = load_model(model_path) | ||
|
||
# Define the path to power spectral density (PSD) plots | ||
data_spool = 'UTC-YMD20220617-HMS155316.989' | ||
root_dir = f'/globalscratch/ahmad9/caserm/spectrum_analysis/spectrum_plots/{data_spool}/' | ||
|
||
# Define your desired threshold for density score | ||
density_threshold = 15934.15 | ||
|
||
# Define generators for training, validation and, anomaly data. | ||
batch_size = 64 | ||
datagen = ImageDataGenerator(rescale=1./255) | ||
|
||
# Create the train generator (with same parameters as for the trained model) | ||
train_generator = datagen.flow_from_directory( | ||
"", | ||
target_size=(size, size), | ||
batch_size=batch_size, | ||
class_mode='input' | ||
) | ||
|
||
# Define the autoencoder | ||
# Encoder | ||
model = Sequential() | ||
model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3))) | ||
model.add(MaxPooling2D((2, 2), padding='same')) | ||
model.add(Conv2D(32, (3, 3), activation='relu', padding='same')) | ||
model.add(MaxPooling2D((2, 2), padding='same')) | ||
model.add(Conv2D(16, (3, 3), activation='relu', padding='same')) | ||
model.add(MaxPooling2D((2, 2), padding='same')) | ||
|
||
# Decoder | ||
model.add(Conv2D(16, (3, 3), activation='relu', padding='same')) | ||
model.add(UpSampling2D((2, 2))) | ||
model.add(Conv2D(32, (3, 3), activation='relu', padding='same')) | ||
model.add(UpSampling2D((2, 2))) | ||
model.add(Conv2D(64, (3, 3), activation='relu', padding='same')) | ||
model.add(UpSampling2D((2, 2))) | ||
|
||
model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same')) | ||
|
||
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse']) | ||
|
||
# Read the history of the trained model | ||
with open(os.path.join(results_path, 'history_1_128.json'), 'r') as json_file: | ||
history_dict = json.load(json_file) | ||
|
||
# Extract the encoder network, with trained weights | ||
encoder_model = Sequential() | ||
# Add the convolutional layer without weights | ||
encoder_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3))) | ||
# Set the weights from the corresponding layer of the loaded model | ||
encoder_model.layers[-1].set_weights(loaded_model.layers[0].get_weights()) | ||
encoder_model.add(MaxPooling2D((2, 2), padding='same')) | ||
encoder_model.add(Conv2D(32, (3, 3), activation='relu', padding='same')) | ||
encoder_model.layers[-1].set_weights(loaded_model.layers[2].get_weights()) | ||
encoder_model.add(MaxPooling2D((2, 2), padding='same')) | ||
encoder_model.add(Conv2D(16, (3, 3), activation='relu', padding='same')) | ||
encoder_model.layers[-1].set_weights(loaded_model.layers[4].get_weights()) | ||
encoder_model.add(MaxPooling2D((2, 2), padding='same')) | ||
|
||
# Calculate KDE of latent space using sklearn and determine if PSD is an anomaly | ||
encoded_images = encoder_model.predict(train_generator) | ||
encoder_output_shape = encoder_model.output_shape | ||
out_vector_shape = encoder_output_shape[1]*encoder_output_shape[2]*encoder_output_shape[3] | ||
|
||
encoded_images_vector = [np.reshape(img, (out_vector_shape)) for img in encoded_images] | ||
|
||
# Fit KDE to the image latent data | ||
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(encoded_images_vector) | ||
|
||
# Write whether the image is an anomaly in a text file | ||
for folder_name in os.listdir(root_dir): | ||
folder_path = os.path.join(root_dir, folder_name) | ||
if os.path.isdir(folder_path): | ||
# Construct the glob pattern for PSD file paths in the current folder | ||
spectrum_file_pattern = os.path.join(folder_path, '*') | ||
spectrum_file_paths = glob.glob(spectrum_file_pattern) | ||
# SSave the the results in a text file | ||
with open(results_path + data_spool + '/' + str(folder_name) + '_output_model_1_128_anomaly.txt', 'w') as file: | ||
for i in range (0,len(spectrum_file_paths)): | ||
print( | ||
f"Line {i}, image {spectrum_file_paths[i]}: | ||
{check_if_anomaly(encoder_model=encoder_model, size=size, img_path=spectrum_file_paths[i], density_threshold=density_threshold, kde=kde)}", | ||
file=file | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash | ||
#SBATCH -t 36:00:00 | ||
#SBATCH -A casermminefiber | ||
#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_event_detection_%j.out | ||
#SBATCH --mem-per-cpu=128G | ||
|
||
# Print start time | ||
echo "Job started at: $(date)" | ||
|
||
source activate casermml | ||
python detect_anomalies.py | ||
|
||
# Print end time | ||
echo "Job ended at: $(date)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
""" | ||
Plot power spectral density (PSD) plots using MPI. | ||
""" | ||
import numpy as np | ||
import os | ||
import sys | ||
import time | ||
|
||
import dascore as dc | ||
from mpi4py import MPI | ||
|
||
current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
source_dir = os.path.join(current_dir, '..') | ||
sys.path.append(source_dir) | ||
from utils import plot_spec | ||
|
||
|
||
if not sys.warnoptions: | ||
import warnings | ||
warnings.simplefilter("ignore") | ||
os.system('set HDF5_USE_FILE=FALSE') | ||
|
||
# Intials | ||
t0 = time.time() | ||
data_path = '/projects/casermminefiber/UTC-YMD20220617-HMS155316.989/' | ||
fig_path = '/globalscratch/ahmad9/caserm/spectrum_analysis/spectrum_plots/UTC-YMD20220617-HMS155316.989/' | ||
|
||
# Initiate MPI | ||
comm = MPI.COMM_WORLD | ||
rank = comm.Get_rank() | ||
size = comm.Get_size() | ||
|
||
if rank==0: | ||
sp = dc.spool(data_path).update() | ||
num_patch = len(sp) | ||
|
||
print("Number of patches in the spool: ", num_patch) | ||
|
||
splits = num_patch | ||
if splits<1: | ||
raise ValueError('No patch of DAS data found within data path: %s'%(data_path)) | ||
|
||
patch = sp[0] | ||
time_step = patch.coords.step("time") | ||
sampling_rate = int(1/(time_step / np.timedelta64(1, 's'))) | ||
num_sec = patch.seconds | ||
distance_step = round(patch.coords.step("distance"),5) | ||
else: | ||
splits = sp = sampling_rate = num_sec = distance_step = None | ||
|
||
# Broadcast the variables | ||
splits = comm.bcast(splits, root=0) | ||
sp = comm.bcast(sp, root=0) | ||
sampling_rate = comm.bcast(sampling_rate, root=0) | ||
num_sec = comm.bcast(num_sec, root=0) | ||
distance_step = comm.bcast(distance_step, root=0) | ||
|
||
# Set parameters for preprocessing the data | ||
gauge_length = distance_step*2 | ||
start_channel = 100 | ||
end_channel = 375 | ||
min_freq = 0 | ||
max_freq = 0.9*0.5*sampling_rate | ||
time_window = 2 # sec | ||
time_overlap = 1 # sec | ||
|
||
# Loop over files (MPI) | ||
for i in range(rank,splits,size): | ||
print(f"wroking on patch number: {i}") | ||
patch = sp[i].transpose("time", "distance") | ||
das_data = patch.data | ||
# Velocity to strain rate | ||
gauge_samples = int(round(gauge_length / distance_step)) | ||
strain_rate = das_data[:, gauge_samples:] - das_data[:, :-gauge_samples] | ||
strain_rate = strain_rate / gauge_length | ||
|
||
num_digits = len(str(int(num_sec))) | ||
|
||
# Loop over time windows and plot the PSDs | ||
for j in range(0,int(num_sec),time_overlap): | ||
start_time = j | ||
end_time = start_time + time_window | ||
if end_time>num_sec: | ||
continue | ||
title = os.path.splitext(str(sp.get_contents()["path"][i]))[0]+'_'+str(start_time).zfill(num_digits)+'-'+str(end_time).zfill(num_digits)+'sec' | ||
output_rank = str(rank).zfill(int(len(str(int(size))))) | ||
plot_spec(strain_rate, start_time, end_time, start_channel, end_channel, min_freq, max_freq, sampling_rate, title, output_rank, fig_path) | ||
|
||
sys.exit() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash | ||
#SBATCH --ntasks=250 | ||
#SBATCH -t 12:00:00 | ||
#SBATCH -A casermminefiber | ||
#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_MPI_%j.out | ||
#SBATCH --mem-per-cpu=16G | ||
|
||
# Print start time | ||
echo "Job started at: $(date)" | ||
|
||
module load openmpi/gcc/64/4.0.4 | ||
source activate casermmpi | ||
mpirun -n ${SLURM_NTASKS} python spectrum_MPI.py | ||
|
||
# Print end time | ||
echo "Job ended at: $(date)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
""" | ||
This script uses unsupervised autoencoders to train a deep learning model for anomaly detection in images. | ||
For microseismic event detection, images can be the power spectral density (PSD) plots. | ||
We will consider the bottleneck layer output from our autoencoder as the latent space. | ||
Using the reconstruction error and kernel density estimation (KDE) based on the vectors in the latent space, anomalies can be detected. | ||
""" | ||
import json | ||
import os | ||
import sys | ||
|
||
from tensorflow.keras.models import Sequential | ||
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D | ||
from tensorflow.keras.preprocessing.image import ImageDataGenerator | ||
|
||
current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
source_dir = os.path.join(current_dir, '..') | ||
sys.path.append(source_dir) | ||
from utils import plot_train_test_loss | ||
|
||
|
||
# Specify path to save results (model and plots) | ||
results_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/results/' | ||
|
||
# Size of the input images and number of epoches for training | ||
size = 128 | ||
num_epoch = 1000 | ||
|
||
# Define generators for training, validation, and anomaly data. | ||
batch_size = 64 | ||
datagen = ImageDataGenerator(rescale=1./255) | ||
|
||
# path to training PSD plots (seen data) | ||
train_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/plots/train/' | ||
num_train_data = 768 | ||
train_generator = datagen.flow_from_directory( | ||
train_path, | ||
target_size=(size, size), | ||
batch_size=batch_size, | ||
class_mode='input' | ||
) | ||
|
||
# path to testing PSD plots (unseen data) | ||
test_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/plots/test/' | ||
num_test_data = 192 | ||
validation_generator = datagen.flow_from_directory( | ||
test_path, | ||
target_size=(size, size), | ||
batch_size=batch_size, | ||
class_mode='input' | ||
) | ||
|
||
# path to known seismic events | ||
events_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/seismic_events/plots/obvious_seismic_events/' | ||
anomaly_generator = datagen.flow_from_directory( | ||
events_path, | ||
target_size=(size, size), | ||
batch_size=batch_size, | ||
class_mode='input' | ||
) | ||
|
||
# Define the autoencoder. | ||
# Encoder | ||
model = Sequential() | ||
model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3))) | ||
model.add(MaxPooling2D((2, 2), padding='same')) | ||
model.add(Conv2D(32, (3, 3), activation='relu', padding='same')) | ||
model.add(MaxPooling2D((2, 2), padding='same')) | ||
model.add(Conv2D(16, (3, 3), activation='relu', padding='same')) | ||
model.add(MaxPooling2D((2, 2), padding='same')) | ||
|
||
# Decoder | ||
model.add(Conv2D(16, (3, 3), activation='relu', padding='same')) | ||
model.add(UpSampling2D((2, 2))) | ||
model.add(Conv2D(32, (3, 3), activation='relu', padding='same')) | ||
model.add(UpSampling2D((2, 2))) | ||
model.add(Conv2D(64, (3, 3), activation='relu', padding='same')) | ||
model.add(UpSampling2D((2, 2))) | ||
|
||
model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same')) | ||
|
||
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse']) | ||
model.summary() | ||
|
||
# Fit the model. | ||
history = model.fit( | ||
train_generator, | ||
steps_per_epoch= num_train_data // batch_size, | ||
epochs=num_epoch, | ||
validation_data=validation_generator, | ||
validation_steps=num_test_data // batch_size, | ||
shuffle = True) | ||
|
||
# Save the model in h5 format | ||
model.save(results_path + 'model_1_128.h5') | ||
|
||
# Save the history as well | ||
history_dict = history.history | ||
history_json = json.dumps(history_dict) | ||
with open(results_path + 'history_1_128.json', 'w') as json_file: | ||
json_file.write(history_json) | ||
|
||
# Plot the training and validation accuracy and loss at each epoch | ||
plot_train_test_loss(history, results_path) |
Oops, something went wrong.