first

ahmadtourei · Jul 3, 2024 · daacd35 · daacd35
1 parent 217f332
commit daacd35
Show file tree

Hide file tree

Showing 12 changed files with 935 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,2 +1,11 @@
 # das-anomaly
-A Python package for an autoencoder-based algorithm to detect anomalies in distributed acoustic sensing (DAS) datasets.
+Python scripts for an autoencoder-based algorithm to detect anomalies in distributed acoustic sensing (DAS) datasets.
+
+The main steps are as follows:
+1. Plot power spectral density (PSD) plots in RGB format. We average the energy over a desired time window and stack all channels together to create a PSD with channels on the X-axis and frequency on the Y-axis. We create PSD of normal images (images without any anomaly or seismic event) and known seismic events. We can use MPI to distribute plotting PSDs over CPUs. 
+2. Train the model on normal PSD images. 
+3. Use the trained model to detect anomalies in PSDs.
+4. Count number of detected anomalies (if needed).
+
+Contact: Ahmad Tourei, Colorado School of Mines
+tourei@mines.edu | ahmadtourei@gmail.com
diff --git a/source/__pycache__/utils.cpython-311.pyc b/source/__pycache__/utils.cpython-311.pyc
diff --git a/source/count_anomalies/count_anomalies.py b/source/count_anomalies/count_anomalies.py
@@ -0,0 +1,26 @@
+"""
+Count number of anomalies from the result (saved) text files.
+"""
+import os
+import sys
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+source_dir = os.path.join(current_dir, '..')
+sys.path.append(source_dir)
+from utils import search_keyword_in_files
+
+
+root = '/globalscratch/ahmad9/caserm/spectrum_analysis/results/' 
+result_folder = 'UTC-YMD20220609-HMS124917.291'
+
+keyword = 'anomaly'  
+
+directory = os.path.join(root, result_folder)
+
+total_count, lines = search_keyword_in_files(directory, keyword)
+
+with open(root + 'anomalies_' + result_folder + '.txt', 'w') as file:
+    for line in lines:
+        file.write(f"{line}\n")
+
+print(f"Total occurrences of the '{keyword}':", total_count)
diff --git a/source/count_anomalies/count_anomalies.sh b/source/count_anomalies/count_anomalies.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+#SBATCH -t 00:10:00
+#SBATCH -A casermminefiber
+#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_count_anomaly_%j.out
+#SBATCH --mem-per-cpu=16G
+
+# Print start time
+echo "Job started at: $(date)"
+
+source activate casermml
+python count_anomalies.py
+
+# Print end time
+echo "Job ended at: $(date)"
diff --git a/source/detect_anomalies/detect_anomalies.py b/source/detect_anomalies/detect_anomalies.py
@@ -0,0 +1,113 @@
+"""
+Use the train model to detect anomalies (potential seismic events.)
+"""
+import glob
+import json
+import numpy as np
+import os
+import sys
+
+from keras.models import load_model
+from sklearn.neighbors import KernelDensity
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+source_dir = os.path.join(current_dir, '..')
+sys.path.append(source_dir)
+from utils import density, check_if_anomaly
+
+
+# Size of the input images 
+size = 128
+
+# Define the path to the results 
+results_path = '/globalscratch/ahmad9/caserm/spectrum_analysis/results/'
+model_path = results_path + f'model_1_{size}.h5'
+loaded_model = load_model(model_path)
+
+# Define the path to power spectral density (PSD) plots 
+data_spool = 'UTC-YMD20220617-HMS155316.989'
+root_dir = f'/globalscratch/ahmad9/caserm/spectrum_analysis/spectrum_plots/{data_spool}/'
+
+# Define your desired threshold for density score
+density_threshold = 15934.15
+
+# Define generators for training, validation and, anomaly data.
+batch_size = 64
+datagen = ImageDataGenerator(rescale=1./255)
+
+# Create the train generator (with same parameters as for the trained model)
+train_generator = datagen.flow_from_directory(
+    "",
+    target_size=(size, size),
+    batch_size=batch_size,
+    class_mode='input'
+    )
+
+# Define the autoencoder
+# Encoder
+model = Sequential()
+model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3)))
+model.add(MaxPooling2D((2, 2), padding='same'))
+model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
+model.add(MaxPooling2D((2, 2), padding='same'))
+model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
+model.add(MaxPooling2D((2, 2), padding='same'))
+
+# Decoder
+model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
+model.add(UpSampling2D((2, 2)))
+model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
+model.add(UpSampling2D((2, 2)))
+model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
+model.add(UpSampling2D((2, 2)))
+
+model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same'))
+
+model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
+
+# Read the history of the trained model 
+with open(os.path.join(results_path, 'history_1_128.json'), 'r') as json_file:
+    history_dict = json.load(json_file)
+
+# Extract the encoder network, with trained weights
+encoder_model = Sequential()
+# Add the convolutional layer without weights
+encoder_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3)))
+# Set the weights from the corresponding layer of the loaded model
+encoder_model.layers[-1].set_weights(loaded_model.layers[0].get_weights())
+encoder_model.add(MaxPooling2D((2, 2), padding='same'))
+encoder_model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
+encoder_model.layers[-1].set_weights(loaded_model.layers[2].get_weights())
+encoder_model.add(MaxPooling2D((2, 2), padding='same'))
+encoder_model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
+encoder_model.layers[-1].set_weights(loaded_model.layers[4].get_weights())
+encoder_model.add(MaxPooling2D((2, 2), padding='same'))
+
+# Calculate KDE of latent space using sklearn and determine if PSD is an anomaly
+encoded_images = encoder_model.predict(train_generator)
+encoder_output_shape = encoder_model.output_shape 
+out_vector_shape = encoder_output_shape[1]*encoder_output_shape[2]*encoder_output_shape[3]
+
+encoded_images_vector = [np.reshape(img, (out_vector_shape)) for img in encoded_images]
+
+# Fit KDE to the image latent data
+kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(encoded_images_vector)
+
+# Write whether the image is an anomaly in a text file
+for folder_name in os.listdir(root_dir):
+    folder_path = os.path.join(root_dir, folder_name)
+    if os.path.isdir(folder_path):
+        # Construct the glob pattern for PSD file paths in the current folder
+        spectrum_file_pattern = os.path.join(folder_path, '*')
+        spectrum_file_paths = glob.glob(spectrum_file_pattern)
+    # SSave the the results in a text file 
+    with open(results_path + data_spool + '/' + str(folder_name) + '_output_model_1_128_anomaly.txt', 'w') as file:
+        for i in range (0,len(spectrum_file_paths)):       
+            print(
+                f"Line {i}, image {spectrum_file_paths[i]}: 
+                  {check_if_anomaly(encoder_model=encoder_model, size=size, img_path=spectrum_file_paths[i], density_threshold=density_threshold, kde=kde)}", 
+                  file=file
+                  )
diff --git a/source/detect_anomalies/detect_anomalies.sh b/source/detect_anomalies/detect_anomalies.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+#SBATCH -t 36:00:00
+#SBATCH -A casermminefiber
+#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_event_detection_%j.out
+#SBATCH --mem-per-cpu=128G
+
+# Print start time
+echo "Job started at: $(date)"
+
+source activate casermml
+python detect_anomalies.py
+
+# Print end time
+echo "Job ended at: $(date)"
diff --git a/source/plot_psd/psd_MPI.py b/source/plot_psd/psd_MPI.py
@@ -0,0 +1,89 @@
+"""
+Plot power spectral density (PSD) plots using MPI.
+"""
+import numpy as np
+import os
+import sys
+import time
+
+import dascore as dc
+from mpi4py import MPI
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+source_dir = os.path.join(current_dir, '..')
+sys.path.append(source_dir)
+from utils import plot_spec
+
+
+if not sys.warnoptions:
+    import warnings
+    warnings.simplefilter("ignore")
+os.system('set HDF5_USE_FILE=FALSE')
+
+# Intials
+t0 = time.time()
+data_path = '/projects/casermminefiber/UTC-YMD20220617-HMS155316.989/'
+fig_path = '/globalscratch/ahmad9/caserm/spectrum_analysis/spectrum_plots/UTC-YMD20220617-HMS155316.989/'
+
+# Initiate MPI
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
+size = comm.Get_size()
+
+if rank==0:
+    sp = dc.spool(data_path).update()
+    num_patch = len(sp)
+
+    print("Number of patches in the spool: ", num_patch)
+
+    splits = num_patch
+    if splits<1:
+        raise ValueError('No patch of DAS data found within data path: %s'%(data_path))
+
+    patch = sp[0]
+    time_step = patch.coords.step("time")
+    sampling_rate = int(1/(time_step / np.timedelta64(1, 's')))
+    num_sec = patch.seconds
+    distance_step = round(patch.coords.step("distance"),5) 
+else:
+    splits = sp = sampling_rate = num_sec = distance_step = None
+
+# Broadcast the variables
+splits = comm.bcast(splits, root=0)
+sp = comm.bcast(sp, root=0)
+sampling_rate = comm.bcast(sampling_rate, root=0)
+num_sec = comm.bcast(num_sec, root=0)
+distance_step = comm.bcast(distance_step, root=0)
+
+# Set parameters for preprocessing the data
+gauge_length = distance_step*2
+start_channel = 100
+end_channel = 375
+min_freq = 0
+max_freq = 0.9*0.5*sampling_rate
+time_window = 2 # sec
+time_overlap = 1 # sec
+
+# Loop over files (MPI)
+for i in range(rank,splits,size):
+  print(f"wroking on patch number: {i}")
+  patch = sp[i].transpose("time", "distance")
+  das_data = patch.data
+  # Velocity to strain rate
+  gauge_samples = int(round(gauge_length / distance_step))
+  strain_rate = das_data[:, gauge_samples:] - das_data[:, :-gauge_samples]
+  strain_rate = strain_rate / gauge_length
+
+  num_digits = len(str(int(num_sec)))
+
+  # Loop over time windows and plot the PSDs
+  for j in range(0,int(num_sec),time_overlap):
+    start_time = j
+    end_time = start_time + time_window
+    if end_time>num_sec:  
+      continue
+    title = os.path.splitext(str(sp.get_contents()["path"][i]))[0]+'_'+str(start_time).zfill(num_digits)+'-'+str(end_time).zfill(num_digits)+'sec'
+    output_rank = str(rank).zfill(int(len(str(int(size)))))
+    plot_spec(strain_rate, start_time, end_time, start_channel, end_channel, min_freq, max_freq, sampling_rate, title, output_rank, fig_path)
+
+sys.exit()
diff --git a/source/plot_psd/psd_MPI.sh b/source/plot_psd/psd_MPI.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --ntasks=250
+#SBATCH -t 12:00:00
+#SBATCH -A casermminefiber
+#SBATCH --output=/globalscratch/ahmad9/caserm/spectrum_analysis/outputs/output_MPI_%j.out
+#SBATCH --mem-per-cpu=16G
+
+# Print start time
+echo "Job started at: $(date)"
+
+module load openmpi/gcc/64/4.0.4
+source activate casermmpi
+mpirun -n ${SLURM_NTASKS} python spectrum_MPI.py
+
+# Print end time
+echo "Job ended at: $(date)"
diff --git a/source/train_model/train_ae.py b/source/train_model/train_ae.py
@@ -0,0 +1,105 @@
+"""
+This script uses unsupervised autoencoders to train a deep learning model for anomaly detection in images.
+
+For microseismic event detection, images can be the power spectral density (PSD) plots. 
+
+We will consider the bottleneck layer output from our autoencoder as the latent space. 
+Using the reconstruction error and kernel density estimation (KDE) based on the vectors in the latent space, anomalies can be detected.  
+"""
+import json
+import os
+import sys
+
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+source_dir = os.path.join(current_dir, '..')
+sys.path.append(source_dir)
+from utils import plot_train_test_loss
+
+
+# Specify path to save results (model and plots)
+results_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/results/'
+
+# Size of the input images and number of epoches for training
+size = 128
+num_epoch = 1000
+
+# Define generators for training, validation, and anomaly data.
+batch_size = 64
+datagen = ImageDataGenerator(rescale=1./255)
+
+# path to training PSD plots (seen data)
+train_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/plots/train/'
+num_train_data = 768
+train_generator = datagen.flow_from_directory(
+    train_path,
+    target_size=(size, size),
+    batch_size=batch_size,
+    class_mode='input'
+    )
+
+# path to testing PSD plots (unseen data)
+test_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/background_noise/plots/test/'
+num_test_data = 192
+validation_generator = datagen.flow_from_directory(
+    test_path,
+    target_size=(size, size),
+    batch_size=batch_size,
+    class_mode='input'
+    )
+
+# path to known seismic events
+events_path = '/u/pa/nb/tourei/scratch/caserm/spectrum_analysis/seismic_events/plots/obvious_seismic_events/'
+anomaly_generator = datagen.flow_from_directory(
+    events_path,
+    target_size=(size, size),
+    batch_size=batch_size,
+    class_mode='input'
+    )
+
+# Define the autoencoder. 
+# Encoder
+model = Sequential()
+model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(size, size, 3)))
+model.add(MaxPooling2D((2, 2), padding='same'))
+model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
+model.add(MaxPooling2D((2, 2), padding='same'))
+model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
+model.add(MaxPooling2D((2, 2), padding='same'))
+
+# Decoder
+model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
+model.add(UpSampling2D((2, 2)))
+model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
+model.add(UpSampling2D((2, 2)))
+model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
+model.add(UpSampling2D((2, 2)))
+
+model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same'))
+
+model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
+model.summary()
+
+# Fit the model. 
+history = model.fit(
+        train_generator,
+        steps_per_epoch= num_train_data // batch_size,
+        epochs=num_epoch,
+        validation_data=validation_generator,
+        validation_steps=num_test_data // batch_size,
+        shuffle = True)
+
+# Save the model in h5 format
+model.save(results_path + 'model_1_128.h5')  
+
+# Save the history as well
+history_dict = history.history
+history_json = json.dumps(history_dict)
+with open(results_path + 'history_1_128.json', 'w') as json_file:
+    json_file.write(history_json)
+
+# Plot the training and validation accuracy and loss at each epoch
+plot_train_test_loss(history, results_path)