nlp_internship.py

# -*- coding: utf-8 -*-
"""Nlp internship.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QStZaOSnlBo2ct3p9gPAmtW4ehPWqkdK
"""

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

train_df = pd.read_excel("/content/drive/MyDrive/Books/train.xlsx")
eval_df = pd.read_excel("/content/drive/MyDrive/Books/evaluation.xlsx")

!pip install transformers
!pip install datasets

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

# ... (the rest of the functions remain the same)

# Replace these lines with the actual paths to your train and eval CSV files
# train_df = pd.read_csv("train.csv")
# eval_df = pd.read_csv("eval.csv")
def generate_negatives(df, multiplier=1):
    negative_df = df.copy()
    for _ in range(multiplier):
        negative_df['reason'] = negative_df['reason'].apply(lambda x: ' '.join(random.sample(x.split(), len(x.split()))))
    negative_df['label'] = 0
    return pd.concat([df, negative_df], ignore_index=True)

def preprocess_dataset(df, tokenizer):
    def encode(example):
        inputs = tokenizer(example['text'], example['reason'], padding=True, truncation=True, max_length=512, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in inputs.items()}
    
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


def compute_metrics(eval_pred: EvalPrediction):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'precision': precision, 'recall': recall, 'f1': f1}

train_df
eval_df

train_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)
    
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()  # This line was missing in your code

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    report = classification_report(eval_dataset['label'], preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

# ... (the rest of the functions remain the same)

def generate_negatives(df, multiplier=1):
    positive_df = df[df['label'] == 1]
    negative_df = df[df['label'] == 0].sample(len(positive_df), replace=True)
    balanced_df = pd.concat([positive_df, negative_df], ignore_index=True)
    return balanced_df.sample(frac=1).reset_index(drop=True)

def preprocess_dataset(df, tokenizer):
    def encode(example):
        inputs = tokenizer(example['text'], example['reason'], padding=True, truncation=True, max_length=512, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in inputs.items()}
    
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


def compute_metrics(eval_pred: EvalPrediction):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'precision': precision, 'recall': recall, 'f1': f1}

# ... (preprocess_dataset and compute_metrics functions remain the same)

train_df
eval_df

train_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)

    num_pos = len(train_df[train_df['label'] == 1])
    num_neg = len(train_df[train_df['label'] == 0])
    pos_weight = num_neg / num_pos

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
        report_to="none",
    )

    model.config.loss_function = 'CrossEntropyLoss'
    model.config.pos_weight = pos_weight

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    report = classification_report(eval_dataset['label'], preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)
#This is to address the issue of poor class balance

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

# ... (generate_negatives, preprocess_dataset, and compute_metrics functions remain the same)
def generate_negatives(df, multiplier=1):
    positive_df = df[df['label'] == 1]
    negative_df = df[df['label'] == 0].sample(len(positive_df), replace=True)
    balanced_df = pd.concat([positive_df, negative_df], ignore_index=True)
    return balanced_df.sample(frac=1).reset_index(drop=True)

def preprocess_dataset(df, tokenizer):
    def encode(example):
        inputs = tokenizer(example['text'], example['reason'], padding=True, truncation=True, max_length=512, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in inputs.items()}
    
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


def compute_metrics(eval_pred: EvalPrediction):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'precision': precision, 'recall': recall, 'f1': f1}


train_df
eval_df

train_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base', 'sentence-transformers/bert-base-nli-mean-tokens']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)

    num_pos = len(train_df[train_df['label'] == 1])
    num_neg = len(train_df[train_df['label'] == 0])
    pos_weight = num_neg / num_pos

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=5,  # Increase the number of epochs
        learning_rate=2e-5,  # Fine-tune the learning rate
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
        report_to="none",
        lr_scheduler_type='cosine',  # Use cosine learning rate scheduler
    )

    model.config.loss_function = 'CrossEntropyLoss'
    model.config.pos_weight = pos_weight

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    report = classification_report(eval_dataset['label'], preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)

import pandas as pd

# Prepare the data for the table
data = []
for model_name, report in model_results.items():
    precision = report['1']['precision']
    recall = report['1']['recall']
    f1_score = report['1']['f1-score']
    data.append([model_name, precision, recall, f1_score])

# Create the DataFrame and set the column names
results_df = pd.DataFrame(data, columns=['Model', 'Precision', 'Recall', 'F1 Score'])

# Display the DataFrame
print(results_df)

"""To analyze the errors made by each model and discuss possible reasons for these errors, you can start by examining the misclassified examples. This can be done by comparing the ground truth labels with the model's predictions. You can use the following code snippet to get the misclassified examples for each model:

## Error Analysis
"""

misclassified_examples = {}

for model_name in models:
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    misclassified_indices = np.where(eval_dataset['label'] != preds)[0]
    misclassified_examples[model_name] = eval_df.iloc[misclassified_indices]

for model_name, misclassified_df in misclassified_examples.items():
    print(f"\nMisclassified examples for {model_name}:")
    print(misclassified_df)