MarkTechPost@AI, September 16
DNA Sequence Classification and Interpretation with an Attention-Based CNN

This article details how to build an advanced convolutional neural network (CNN) for DNA sequence classification, simulating biological tasks such as promoter prediction and splice-site detection. By combining one-hot encoding, multi-scale convolutional layers, and an attention mechanism, the model not only learns complex DNA patterns but also offers interpretability. The tutorial covers synthetic data generation, robust training with callbacks, and result visualization, building a full picture of the model's strengths and limitations. Finally, a code walkthrough and performance evaluation demonstrate the CNN's accuracy and interpretability on DNA sequence classification.

🎯 **Model architecture and core techniques**: The article builds a CNN-based DNA sequence classifier whose core combines one-hot encoding to represent DNA bases, multi-scale convolutional layers to capture sequence patterns of different lengths, and an attention mechanism that focuses the model on key sequence regions, improving both classification accuracy and interpretability. A minimal encoding sketch follows below.
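To make the encoding concrete before the full class appears, here is a minimal standalone sketch (our illustration, mirroring the `one_hot_encode` method shown later, not the article's exact code): each base maps to one of four channels, and unrecognized characters are left as all-zero rows.

```python
# Minimal sketch of the one-hot encoding used in the tutorial:
# A/T/G/C each map to one of four channels.
import numpy as np

def one_hot(seq, length=8):
    mapping = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    encoded = np.zeros((length, 4))
    for j, base in enumerate(seq[:length]):
        if base in mapping:  # unknown bases (e.g. 'N') stay all-zero
            encoded[j, mapping[base]] = 1
    return encoded

print(one_hot("ATGCATGC"))  # an 8x4 matrix with exactly one 1 per row
```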

🧬 **Synthetic data generation and model training**: To mimic realistic biological scenarios, the article shows how to generate synthetic DNA sequences that embed specific positive and negative motifs. Training uses callbacks including early stopping (EarlyStopping) and learning-rate decay (ReduceLROnPlateau) to optimize the process, prevent overfitting, and ensure the model converges to its best performance. A motif-insertion sketch follows below.
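As a quick illustration of the data-generation idea (a sketch mirroring the `generate_synthetic_data` method below, not the article's exact code), a motif such as the TATA-box-like `TATAAA` can be spliced into a random background sequence at a random position:

```python
# Plant a known motif into a random background sequence (illustrative only).
import random

random.seed(0)
nucleotides = ['A', 'T', 'G', 'C']
motif = 'TATAAA'  # one of the tutorial's positive motifs

background = ''.join(random.choices(nucleotides, k=40))
pos = random.randint(0, len(background) - len(motif))
planted = background[:pos] + motif + background[pos + len(motif):]

print(planted)
print(' ' * pos + motif)  # align the motif under its insertion site
```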

📊 **Performance evaluation and result visualization**: After training, the model's test-set performance is assessed with a classification report, a confusion matrix, and prediction-score distribution plots. These visualizations make it easy to read off accuracy, recall, and precision, and to gauge how confidently the model separates positive from negative samples, giving a full picture of its classification ability and potential directions for improvement. A small sketch of deriving these metrics from a confusion matrix follows below.
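For readers less familiar with these metrics, the sketch below (with made-up counts) shows how precision, recall, and accuracy fall out of the confusion-matrix cells that the tutorial later plots:

```python
# How precision/recall/accuracy derive from confusion-matrix cells
# (counts here are invented for illustration).
import numpy as np

cm = np.array([[950, 50],    # row 0: actual negatives  [TN, FP]
               [30, 970]])   # row 1: actual positives  [FN, TP]
tn, fp, fn, tp = cm.ravel()

precision = tp / (tp + fp)   # of predicted positives, how many are real
recall = tp / (tp + fn)      # of real positives, how many were found
accuracy = (tp + tn) / cm.sum()

print(f"precision={precision:.3f} recall={recall:.3f} accuracy={accuracy:.3f}")
```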

In this tutorial, we take a hands-on approach to building an advanced convolutional neural network for DNA sequence classification. We focus on simulating real biological tasks, such as promoter prediction, splice site detection, and regulatory element identification. By combining one-hot encoding, multi-scale convolutional layers, and an attention mechanism, we design a model that not only learns complex motifs but also provides interpretability. As we progress, we generate synthetic data, train with robust callbacks, and visualize results to ensure we fully understand the strengths and limitations of our approach. Check out the FULL CODES here.

```python
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import random

np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)
```

We begin by importing the libraries for deep learning, data handling, and visualization. We set random seeds to ensure reproducibility so that our experiments run consistently each time. Check out the FULL CODES here.

```python
class DNASequenceClassifier:
    def __init__(self, sequence_length=200, num_classes=2):
        self.sequence_length = sequence_length
        self.num_classes = num_classes
        self.model = None
        self.history = None

    def one_hot_encode(self, sequences):
        # Map each base to one of four channels; unknown bases stay all-zero.
        mapping = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
        encoded = np.zeros((len(sequences), self.sequence_length, 4))
        for i, seq in enumerate(sequences):
            for j, nucleotide in enumerate(seq[:self.sequence_length]):
                if nucleotide in mapping:
                    encoded[i, j, mapping[nucleotide]] = 1
        return encoded

    def attention_layer(self, inputs, name="attention"):
        # Score each position, softmax over positions, then reweight the features.
        attention_weights = layers.Dense(1, activation='tanh', name=f"{name}_weights")(inputs)
        attention_weights = layers.Flatten()(attention_weights)
        attention_weights = layers.Activation('softmax', name=f"{name}_softmax")(attention_weights)
        attention_weights = layers.RepeatVector(inputs.shape[-1])(attention_weights)
        attention_weights = layers.Permute([2, 1])(attention_weights)
        attended = layers.Multiply(name=f"{name}_multiply")([inputs, attention_weights])
        return layers.GlobalMaxPooling1D()(attended)

    def build_model(self):
        inputs = layers.Input(shape=(self.sequence_length, 4), name="dna_input")

        # Parallel convolutional branches capture motifs at several scales.
        conv_layers = []
        filter_sizes = [3, 7, 15, 25]
        for filter_size in filter_sizes:
            conv = layers.Conv1D(
                filters=64,
                kernel_size=filter_size,
                activation='relu',
                padding='same',
                name=f"conv_{filter_size}"
            )(inputs)
            conv = layers.BatchNormalization(name=f"bn_conv_{filter_size}")(conv)
            conv = layers.Dropout(0.2, name=f"dropout_conv_{filter_size}")(conv)
            attended = self.attention_layer(conv, name=f"attention_{filter_size}")
            conv_layers.append(attended)

        if len(conv_layers) > 1:
            merged = layers.Concatenate(name="concat_multiscale")(conv_layers)
        else:
            merged = conv_layers[0]

        dense = layers.Dense(256, activation='relu', name="dense_1")(merged)
        dense = layers.BatchNormalization(name="bn_dense_1")(dense)
        dense = layers.Dropout(0.5, name="dropout_dense_1")(dense)
        dense = layers.Dense(128, activation='relu', name="dense_2")(dense)
        dense = layers.BatchNormalization(name="bn_dense_2")(dense)
        dense = layers.Dropout(0.3, name="dropout_dense_2")(dense)

        if self.num_classes == 2:
            outputs = layers.Dense(1, activation='sigmoid', name="output")(dense)
            loss = 'binary_crossentropy'
            metrics = ['accuracy', 'precision', 'recall']
        else:
            outputs = layers.Dense(self.num_classes, activation='softmax', name="output")(dense)
            loss = 'categorical_crossentropy'
            metrics = ['accuracy']

        self.model = keras.Model(inputs=inputs, outputs=outputs, name="DNA_CNN_Classifier")
        optimizer = keras.optimizers.Adam(
            learning_rate=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-7
        )
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        return self.model

    def generate_synthetic_data(self, n_samples=10000):
        sequences = []
        labels = []
        positive_motifs = ['TATAAA', 'CAAT', 'GGGCGG', 'TTGACA']
        negative_motifs = ['AAAAAAA', 'TTTTTTT', 'CCCCCCC', 'GGGGGGG']
        nucleotides = ['A', 'T', 'G', 'C']
        for i in range(n_samples):
            sequence = ''.join(random.choices(nucleotides, k=self.sequence_length))
            if i < n_samples // 2:
                # Positive class: always plant one positive motif.
                motif = random.choice(positive_motifs)
                pos = random.randint(0, self.sequence_length - len(motif))
                sequence = sequence[:pos] + motif + sequence[pos + len(motif):]
                label = 1
            else:
                # Negative class: plant a low-complexity motif 30% of the time.
                if random.random() < 0.3:
                    motif = random.choice(negative_motifs)
                    pos = random.randint(0, self.sequence_length - len(motif))
                    sequence = sequence[:pos] + motif + sequence[pos + len(motif):]
                label = 0
            sequences.append(sequence)
            labels.append(label)
        return sequences, np.array(labels)

    def train(self, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-6
            )
        ]
        self.history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )
        return self.history

    def evaluate_and_visualize(self, X_test, y_test):
        y_pred_proba = self.model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).flatten()

        print("Classification Report:")
        print(classification_report(y_test, y_pred))

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        axes[0, 0].plot(self.history.history['loss'], label='Training Loss')
        axes[0, 0].plot(self.history.history['val_loss'], label='Validation Loss')
        axes[0, 0].set_title('Training History - Loss')
        axes[0, 0].set_xlabel('Epoch')
        axes[0, 0].set_ylabel('Loss')
        axes[0, 0].legend()

        axes[0, 1].plot(self.history.history['accuracy'], label='Training Accuracy')
        axes[0, 1].plot(self.history.history['val_accuracy'], label='Validation Accuracy')
        axes[0, 1].set_title('Training History - Accuracy')
        axes[0, 1].set_xlabel('Epoch')
        axes[0, 1].set_ylabel('Accuracy')
        axes[0, 1].legend()

        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', ax=axes[1, 0], cmap='Blues')
        axes[1, 0].set_title('Confusion Matrix')
        axes[1, 0].set_ylabel('Actual')
        axes[1, 0].set_xlabel('Predicted')

        axes[1, 1].hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Negative', density=True)
        axes[1, 1].hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='Positive', density=True)
        axes[1, 1].set_title('Prediction Score Distribution')
        axes[1, 1].set_xlabel('Prediction Score')
        axes[1, 1].set_ylabel('Density')
        axes[1, 1].legend()

        plt.tight_layout()
        plt.show()

        return y_pred, y_pred_proba
```

We define a DNASequenceClassifier that encodes sequences, learns multi-scale motifs with CNNs, and applies an attention mechanism for interpretability. We build and compile the model, generate synthetic motif-rich data, and then train with robust callbacks and visualize performance to evaluate classification quality. Check out the FULL CODES here.
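Because every layer in the model is named, we can also probe what the attention mechanism learns. The following sketch is our addition, not part of the original tutorial: assuming `classifier.model` has been built and trained as above and a test batch `X_test` is available, it reuses the layer name `attention_7_softmax` that `attention_layer()` assigns in the kernel-size-7 branch and plots which sequence positions receive the most weight.

```python
# Hedged sketch: inspect attention weights, assuming a trained
# classifier.model and an encoded test set X_test from the steps above.
attention_probe = keras.Model(
    inputs=classifier.model.input,
    outputs=classifier.model.get_layer("attention_7_softmax").output,
)

weights = attention_probe.predict(X_test[:1])  # shape: (1, sequence_length)

plt.figure(figsize=(12, 2))
plt.plot(weights[0])
plt.title("Attention weights (kernel size 7) for one test sequence")
plt.xlabel("Sequence position")
plt.ylabel("Weight")
plt.show()
```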

```python
def main():
    print("Advanced DNA Sequence Classification with CNN")
    print("=" * 50)

    classifier = DNASequenceClassifier(sequence_length=200, num_classes=2)

    print("Generating synthetic DNA sequences...")
    sequences, labels = classifier.generate_synthetic_data(n_samples=10000)

    print("Encoding DNA sequences...")
    X = classifier.one_hot_encode(sequences)

    # Hold out a test set, then carve a validation set from the remainder.
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.2, random_state=42, stratify=labels
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    print(f"Training set: {X_train.shape}")
    print(f"Validation set: {X_val.shape}")
    print(f"Test set: {X_test.shape}")

    print("Building CNN model...")
    model = classifier.build_model()
    print(model.summary())

    print("Training model...")
    classifier.train(X_train, y_train, X_val, y_val, epochs=30, batch_size=64)

    print("Evaluating model...")
    y_pred, y_pred_proba = classifier.evaluate_and_visualize(X_test, y_test)

    print("Training and evaluation complete!")


if __name__ == "__main__":
    main()
```

We wrap up the workflow in the main() function, where we generate synthetic DNA data, encode it, split it into training, validation, and test sets, then build, train, and evaluate our CNN model. We conclude by visualizing the performance and confirming that the classification pipeline runs successfully from start to finish.
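Before committing to the full 10,000-sample run, a shrunken smoke test can verify the pipeline end to end in well under a minute. This is our suggestion, not part of the original tutorial; for speed, the held-out split here doubles as the validation set.

```python
# Smoke test (a sketch): tiny dataset, two epochs, test split reused
# as validation purely to confirm the pipeline runs.
classifier = DNASequenceClassifier(sequence_length=200, num_classes=2)
sequences, labels = classifier.generate_synthetic_data(n_samples=500)
X = classifier.one_hot_encode(sequences)

X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)
classifier.build_model()
classifier.train(X_train, y_train, X_test, y_test, epochs=2, batch_size=32)
```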

In conclusion, we successfully demonstrate how a carefully designed CNN with attention can classify DNA sequences with high accuracy and interpretability. We see how synthetic biological motifs help validate the model’s capacity for pattern recognition, and how visualization techniques provide meaningful insights into training dynamics and predictions. Through this journey, we enhance our ability to integrate deep learning architectures with biological data, laying the groundwork for applying these methods to real-world genomics research.


Check out the FULL CODES here. Feel free to check out our GitHub Page for Tutorials, Codes and Notebooks. Also, feel free to follow us on Twitter and don’t forget to join our 100k+ ML SubReddit and Subscribe to our Newsletter.

The post Building an Advanced Convolutional Neural Network with Attention for DNA Sequence Classification and Interpretability appeared first on MarkTechPost.
