feat: knn model development

362d7e12 · maneeshalakshani · d3f12845 · 362d7e12 · 362d7e12 · 362d7e12
Commit 362d7e12 authored Sep 08, 2023 by maneeshalakshani
49 changed files
--- a/.gitignore
+++ b/.gitignore
+git-colab-terminal.ipynb
+**/__pycache__/
+.vscode/
+.creds/
+/models
+/datasets
+/wandb
+/evaluations
\ No newline at end of file
--- a/Algorithms/Algorithm_Replacement_LR.txt
+++ b/Algorithms/Algorithm_Replacement_LR.txt
+
+Inputs ->
+	feature names -> List of features used in the TFIDF vectoriser. Used to give words in the final output
+	threshold T -> Threshold to consider an output as counterfactual
+	classifier_fn (C) -> classifier prediction probability function in the random forest classifier
+	max_iter -> Maximum number of iterations run before termination if a CF is not found
+	max_time -> Maximum time that the algorithm run before termination if a CF is not found
+Output ->
+	list of words to remove or to change to reverse the model output.
+Process ->
+
+input -> Instance W -> document to classify. Has m words
+
+c = initial predicted class
+p = probability of the predicted class
+r = revert or not. Set to zero if predicted class is positive.
+
+n_explanations = 0
+explanations = {}
+
+combinations_to_expand = {}
+prob_combinations_to_expand = {}
+shap_combinations_to_expand = {}
+
+W = [] indices of features
+R = [] indices of the replacement features r_i of each feature w_i, if such a replacement exists.
+
+for i = 1 to m:
+	p_n = C(w_i) # Instance with w_i removed or changed to r_i
+	if (p_n < T):
+		explanations = explanations U w_i
+	else:
+		combinations_to_expand = combinations_to_expand U w_i
+		prob_combinations_to_expand = prob_combinations_to_expand U w_i
+	end if
+end for
+
+iteration = 1
+start time
+while True:
+	if iteration > max_iter OR time > max_time:
+		end while
+	combi = word combinations to remove for which the change in prediction score towards the reverse class is maximal
+	new_combi_set = expanded combinations of combi without the existing combinations in explanations
+
+	for combo in new_combi_set do:
+		p_n = C(combo) # Instance with the words in combo removed or changed to their replacements
+		if (p_n < T):
+			explanations = explanations U combo
+		else:
+			combinations_to_expand = combinations_to_expand U combo
+			prob_combinations_to_expand = prob_combinations_to_expand U combo
+			shap_combinations_to_expand = shap_combinations_to_expand U shap_vals(combo)
+		end if
+	end for
+	iteration ++
+	increment time
+end while
+
+Replacement antonyms are generated from the WordNet library.
+
+Gives faster results than removal as it pushes the results towards the reverse class
+
+A suitable antonym must be chosen, as an antonym may instead push the prediction towards the current class.
+This is prevented by using SHAP values to choose antonyms.
+Used paper - Text Counterfactuals via Latent Optimization and Shapley-Guided Search
\ No newline at end of file
--- a/Algorithms/Algorithm_SHAP_SEDC.txt
+++ b/Algorithms/Algorithm_SHAP_SEDC.txt
+
+Inputs ->
+	feature names -> List of features used in the TFIDF vectoriser. Used to give words in the final output
+	threshold T -> Threshold to consider an output as counterfactual
+	classifier_fn (C) -> classifier prediction probability function in the random forest classifier
+	max_iter -> Maximum number of iterations run before termination if a CF is not found
+	max_time -> Maximum time that the algorithm run before termination if a CF is not found
+Output ->
+	list of words to remove to reverse the model output.
+Process ->
+
+input -> Instance W -> document to classify. Has m words
+
+c = initial predicted class
+p = probability of the predicted class
+r = revert or not. Set to zero if predicted class is positive.
+
+n_explanations = 0
+explanations = {}
+
+combinations_to_expand = {}
+prob_combinations_to_expand = {}
+shap_combinations_to_expand = {}
+
+shap_vals = {}_n shapley values of each feature with reference point taken as zero vector
+W = [] indices of features sorted in the descending order of shap values
+
+for i = 1 to m:
+	p_n = C(w_i) -> Instance with the feature w_i removed
+	if (p_n < T):
+		explanations = explanations U w_i
+	else:
+		combinations_to_expand = combinations_to_expand U w_i
+		prob_combinations_to_expand = prob_combinations_to_expand U w_i
+		shap_combinations_to_expand = shap_combinations_to_expand U shap_vals(w_i)
+	end if
+end for
+
+iteration = 1
+start time
+while True:
+	if iteration > max_iter OR time > max_time:
+		end while
+	combi = word combinations to remove where shap_combinations_to_expand is maximal
+	new_combi_set = expanded combinations of combi without the existing combinations in explanations
+
+	for combo in new_combi_set do:
+		p_n = C(combo) -> Instance with the features in combo removed
+		if (p_n < T):
+			explanations = explanations U combo
+			end while
+		else:
+			combinations_to_expand = combinations_to_expand U combo
+			prob_combinations_to_expand = prob_combinations_to_expand U combo
+			shap_combinations_to_expand = shap_combinations_to_expand U shap_vals(combo)
+		end if
+	end for
+	iteration ++
+	increment time
+end while
+
+Does not always converge ->
+	Even though shap values individually give measures for each feature better than score change,
+	for a set of features, algebraic sum of shap values is not a good measure.
+
+But for changes involving a small number of words (e.g. 1-4 words):
+	using SHAP values gives faster results
+
+observation - Also gives better results when converting negative results to positive results when using shap values
+
+Can use feature_importances_ of the random forest instead of Shapley values. But need to check whether the feature contributes to a positive or negative change in the current instance.
+
+x1 -> (2.98)
+
+x2 -> 2.98 - 0.6
+x3 -> 2.98 + 1.3
+x4 -> 2.98 + 2.0
+
--- a/Plan.txt
+++ b/Plan.txt
+Current plan - Random forest
+
+Get shapley values of all features for the current model. -> Reduces the randomness of removing features.
+(Text Counterfactuals via Latent Optimization and Shapley-Guided Search) This paper gives replacements instead of removing the feature.
+Get the features contributing to each tree and the whole model for the Random forest -> Reduces the affecting feature number.
+Order the features by the shapley value.
+Change the value of leaf nodes and try to find counterfactuals.
+Works only for the Random Forest.
+
+Current Implementation - Random Forest
+
+Get shapley values of all features for the current model. -> Reduces the randomness of removing features
+Get Shapley values of features.
+Expand and prune the required instance to generate counterfactuals.
+The expand-and-prune order of the counterfactuals is sorted according to the Shapley value of each feature.
+Run the prediction algorithm of the RF model to check if a desirable counterfactual is generated.
+If a desirable length counterfactual is generated, output the counterfactual in a text format.
+
+To be Developed -> 
+
+Get the decision path of each tree in the random forest.
+Find the features affecting the prediction.
+Initially consider only the features that have high Shapley values and are in the set of features used in the decision trees.
+Check for the speed of operation.
+
+NEW 
+Previously -> used shap values (Accurate. But takes time to calculate.)
+    Therefore, use feature_importances_ in the random forest model.
+    These are calculated when training the model. -> Takes relatively less time when compared to shap. No time taken to calculate values.
+        Issues ->   1) Not instance specific
+                    2) Does not give direction of class change
+        solved ->   Get feature_importances_
+                    take instance
+                    remove feature importances not related to the current instance -> Reduces memory consumption + takes less time to calculate.
+                    Take each feature -> Check the effect of removing that feature -> Assign a class change sign to the feature_importances_.
+
+
+
+Current Implementation - Logistic Regression
+
+Get shapley values of all features for the current model. -> Reduces the randomness of removing features
+Get Shapley values of features.
+Expand and prune the required instance to generate counterfactuals.
+The expand-and-prune order of the counterfactuals is sorted according to the Shapley value of each feature.
+Run the prediction algorithm of the LR model to check if a desirable counterfactual is generated.
+If a desirable length counterfactual is generated, output the counterfactual in a text format.
+
+To be Developed
+
+Get the feature importance of the current model using weights of features.
+Check for replacements of features which maximises the change in prediction probability. -> Calculated using shap values.
+Extension to https://arxiv.org/pdf/1906.09293.pdf -> by adding replacements instead of removals to maximise the change
\ No newline at end of file
--- a/Replacement_SEDC_LR.ipynb
+++ b/Replacement_SEDC_LR.ipynb
--- a/SHAP_SEDC_RF.ipynb
+++ b/SHAP_SEDC_RF.ipynb
--- a/configs/datasets/imdb.yaml
+++ b/configs/datasets/imdb.yaml
+name: imdb
+source_url: https://sliit-xai.s3.ap-south-1.amazonaws.com/datasets/imdb.zip
+paths:
+    data: imdb.csv
+split:
+    test: 0.1
+    train: 0.8
+    val: 0.1
+labels:
+    - negative
+    - positive
+extras:
+    input_encoder_path: tfidf.pkl
+    min_df: 30
--- a/configs/datasets/snli_1.0_contra.yaml
+++ b/configs/datasets/snli_1.0_contra.yaml
+name: snli_1.0_contra
+paths:
+    test: snli_1.0_contra_test.csv
+    train: snli_1.0_contra_train.csv
+    val: snli_1.0_contra_val.csv
+source_url: https://sliit-xai.s3.ap-south-1.amazonaws.com/datasets/snli_1.0_contra.zip
+model_name: t5-small
+max_token_len: 64
--- a/configs/models/analysis-models.yaml
+++ b/configs/models/analysis-models.yaml
+name: analysis-models
+source_url: https://sliit-xai.s3.ap-south-1.amazonaws.com/models/analysis-models.zip
+paths:
+    tfidf: tfidf.pkl
+    knn: knn.pkl
+    lr: lr.pkl
+    rf: rf.pkl
+    svm: svm.pkl
+models:
+    knn: knn.pkl
+    lr: lr.pkl
+    rf: rf.pkl
+    svm: svm.pkl
+encoders:
+    input_encoder_name: tfidf
+    output_encoder_name: lut
+    output_labels:
+        - negative
+        - positive
--- a/configs/models/t5-cf-generator.yaml
+++ b/configs/models/t5-cf-generator.yaml
+name: t5-cf-generator
+paths:
+    model: model.pt
+model_config: t5-small
+source_url: https://sliit-xai.s3.ap-south-1.amazonaws.com/models/t5-cf-generator.zip
--- a/configs/models/wf-cf-generator.yaml
+++ b/configs/models/wf-cf-generator.yaml
+name: wf-cf-generator
+flip_prob: 0.5
+flipping_tags:
+    - VB
+    - VBD
+    - VBG
+    - VBN
+    - VBP
+    - VBZ
+sample_prob_decay_factor: 0.2
--- a/deployment/backend/.gitignore
+++ b/deployment/backend/.gitignore
+/app/models
+/app/src
+/app/configs
\ No newline at end of file
--- a/deployment/backend/Dockerfile
+++ b/deployment/backend/Dockerfile
+
+# Define function directory
+ARG FUNCTION_DIR="/function"
+
+FROM python:3.9-buster as build-image
+
+# Install aws-lambda-cpp build dependencies
+RUN apt-get update && \
+  apt-get install -y \
+  g++ \
+  make \
+  cmake \
+  unzip \
+  libcurl4-openssl-dev
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+
+# Create function directory
+RUN mkdir -p ${FUNCTION_DIR}
+
+# Install pip dependencies
+COPY cpu-requirements.txt .
+RUN pip install -r cpu-requirements.txt --index-url https://download.pytorch.org/whl/cpu --target ${FUNCTION_DIR}
+COPY requirements.txt .
+RUN pip install -r requirements.txt --target ${FUNCTION_DIR}
+# Install the runtime interface client
+RUN pip install \
+        --target ${FUNCTION_DIR} \
+        awslambdaric
+
+# Multi-stage build: grab a fresh copy of the base image
+FROM python:3.9-buster
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+# Set working directory to function root directory
+WORKDIR ${FUNCTION_DIR}
+
+# Copy in the build image dependencies
+COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR}
+
+# Download nltk data
+RUN python3 -m nltk.downloader --dir /usr/share/nltk_data wordnet punkt stopwords averaged_perceptron_tagger tagsets
+
+# copy function code
+COPY app/models ${FUNCTION_DIR}/models
+COPY app/configs ${FUNCTION_DIR}/configs
+COPY app/src ${FUNCTION_DIR}/src
+COPY app/handlers ${FUNCTION_DIR}/handlers
+COPY app/app.py ${FUNCTION_DIR}/app.py
+
+# Setting the entry point
+ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
+CMD [ "app.handler" ]
+
--- a/deployment/backend/app/app.py
+++ b/deployment/backend/app/app.py
+from typing import Dict
+from handlers import evaluate, analyze
+import traceback
+
+
+def handler(event: Dict, context: Dict):
+    task = event["task"]
+    payload = event["payload"]
+    try:
+        if task == "evaluation":
+            body = evaluate(payload)
+            return {"status": 200, "body": body}
+        elif task == "analysis":
+            body = analyze(payload)
+            return {"status": 200, "body": body}
+        else:
+            body = "Invocation error"
+            return {"status": 400, "body": body}
+    except Exception as e:
+        body = str(e)
+        print("Error:", body)
+        print("Input:", event)
+        traceback.print_exc()
+        return {"status": 500, "body": body}
--- a/deployment/backend/app/handlers/__init__.py
+++ b/deployment/backend/app/handlers/__init__.py
+from .evaluation import evaluate
+from .analysis import analyze
--- a/deployment/backend/app/handlers/analysis.py
+++ b/deployment/backend/app/handlers/analysis.py
+from typing import Dict
+from src import TestBench
+
+# Extra keyword arguments forwarded to TestBench for each supported model;
+# analyze() looks this table up by the request's "model_name".
+# NOTE(review): evaluations.ipynb in this commit passes the generator config
+# as `cf_generator_config_path`, while the key used here is
+# `cf_generator_config` - confirm which keyword TestBench actually accepts.
+tb_kwargs = {
+    "svm": {"cf_generator_config": "configs/models/wf-cf-generator.yaml"},
+    "knn": {"cf_generator_config": "configs/models/wf-cf-generator.yaml"},
+    "rf": {
+        # presumably a decision threshold tuned offline for this model - TODO confirm
+        "threshold_classifier": 0.49339999999983775,
+        "max_iter": 50,
+        # presumably a time limit in seconds - TODO confirm the unit
+        "time_maximum": 120,
+    },
+    "lr": {
+        # presumably a decision threshold tuned offline for this model - TODO confirm
+        "threshold_classifier": 0.49179999999978463,
+        "max_iter": 50,
+        # presumably a time limit in seconds - TODO confirm the unit
+        "time_maximum": 120,
+    },
+}
+
+
+def analyze(payload: Dict):
+    model_name = payload["model_name"]
+    configurations = payload["configurations"]
+    prompt = payload["prompt"]
+    variations = payload["variations"]
+    tb = TestBench(
+        model_path=f"models/analysis-models/{model_name}.pkl",
+        vectorizer_path="models/analysis-models/tfidf.pkl",
+        analyzer_name=model_name,
+        **tb_kwargs[model_name],
+    )
+    reports = tb(configurations, prompt, variations)
+    reports = "\n\n".join(reports)
+    return reports
--- a/deployment/backend/app/handlers/evaluation.py
+++ b/deployment/backend/app/handlers/evaluation.py
+from src.models import AnalysisModels as Models
+from typing import Dict
+
+
+def evaluate(payload: Dict) -> Dict:
+    texts = payload["texts"]
+    model_name = payload["model_name"]
+    models = Models("configs/models/analysis-models.yaml", "models/analysis-models/")
+    model = getattr(models, model_name)
+    scores, preds = model(texts)
+
+    return {"scores": scores, "predictions": preds}
--- a/deployment/backend/build-and-push.bash
+++ b/deployment/backend/build-and-push.bash
+#!/bin/bash
+
+
+if [ -e Dockerfile ]; then
+    # remove old repititions if they exist
+    if [ -e app/models ]; then
+        rm -r app/models
+    fi
+    if [ -e app/configs ]; then
+        rm -r app/configs
+    fi
+    if [ -e app/src ]; then
+        rm -r app/src
+    fi
+
+    # copy repetitions
+    cp -r ../../models app/models
+    cp -r ../../configs app/configs
+    cp -r ../../src app/src
+
+    # log into docker
+    aws ecr get-login-password --region ap-south-1 | sudo docker login --username AWS --password-stdin 065257926712.dkr.ecr.ap-south-1.amazonaws.com
+
+    # build and push
+    sudo docker build -t 065257926712.dkr.ecr.ap-south-1.amazonaws.com/xai:latest .
+    sudo docker push 065257926712.dkr.ecr.ap-south-1.amazonaws.com/xai:latest
+else
+    echo "Please change the working directory to the directory containing the Dockerfile"
+    exit 1
+fi
\ No newline at end of file
--- a/deployment/backend/cpu-requirements.txt
+++ b/deployment/backend/cpu-requirements.txt
+torch==2.0.1
\ No newline at end of file
--- a/deployment/backend/requirements.txt
+++ b/deployment/backend/requirements.txt
+scikit-learn==1.2.2
+nltk==3.8.1
+ipykernel==6.24.0
+ipywidgets==7.6.5
+pyyaml==6.0
+pandas==2.0.3
+beautifulsoup4==4.12.2
+wget==3.2
+numpy==1.23.5
+shap==0.41.0
+matplotlib==3.5.1
+seaborn==0.11.2
+ordered-set==4.1.0
+boto3==1.27.0
+transformers==4.31.0
+sagemaker==2.173.0
+sentencepiece==0.1.99
\ No newline at end of file
--- a/environment.yaml
+++ b/environment.yaml
+name: xai
+channels:
+    - defaults
+dependencies:
+    - python=3.9
+    - pip=23.1.2
+    - pip:
+          - scikit-learn==1.2.2
+          - nltk==3.8.1
+          - ipykernel==6.24.0
+          - ipywidgets==7.6.5
+          - pyyaml==6.0
+          - pandas==2.0.3
+          - beautifulsoup4==4.12.2
+          - wget==3.2
+          - numpy==1.23.5
+          - shap==0.41.0
+          - matplotlib==3.5.1
+          - seaborn==0.11.2
+          - ordered-set==4.1.0
+          - boto3==1.27.0
+          - torch==2.0.1
+          - transformers==4.31.0
+          - sagemaker==2.173.0
+          - sentencepiece==0.1.99
--- a/evaluations.ipynb
+++ b/evaluations.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.test_bench import TestBench\n",
+    "from src.datasets import IMDBDataset\n",
+    "\n",
+    "ds = IMDBDataset(config_path=\"./configs/datasets/imdb.yaml\", root=\"datasets/imdb\")\n",
+    "x, y = ds.x_test, ds.y_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tb = TestBench(\n",
+    "    model_path=\"./models/analysis-models/knn.pkl\",\n",
+    "    vectorizer_path=\"./models/analysis-models/tfidf.pkl\",\n",
+    "    cf_generator_config_path=\"./configs/models/wf-cf-generator.yaml\",\n",
+    "    analyzer_name=\"knn\"\n",
+    ")\n",
+    "tb.evaluate(x, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tb = TestBench(\n",
+    "    model_path=\"./models/analysis-models/svm.pkl\",\n",
+    "    vectorizer_path=\"./models/analysis-models/tfidf.pkl\",\n",
+    "    cf_generator_config_path=\"./configs/models/wf-cf-generator.yaml\",\n",
+    "    analyzer_name=\"svc\"\n",
+    ")\n",
+    "tb.evaluate(x, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tb = TestBench(\n",
+    "    model_path=\"./models/analysis-models/lr.pkl\",\n",
+    "    vectorizer_path=\"./models/analysis-models/tfidf.pkl\",\n",
+    "    cf_generator_config_path=\"./configs/models/wf-cf-generator.yaml\",\n",
+    "    analyzer_name=\"lr\"\n",
+    ")\n",
+    "tb.evaluate(x, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tb = TestBench(\n",
+    "    model_path=\"./models/analysis-models/rf.pkl\",\n",
+    "    vectorizer_path=\"./models/analysis-models/tfidf.pkl\",\n",
+    "    cf_generator_config_path=\"./configs/models/wf-cf-generator.yaml\",\n",
+    "    analyzer_name=\"rf\"\n",
+    ")\n",
+    "tb.evaluate(ds.x_test, ds.y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "xai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.17"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/generators.ipynb
+++ b/generators.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# T5"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Local"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.train.t5 import fit\n",
+    "from src.datasets import CFGenerativeDataset\n",
+    "from torch.utils.data import DataLoader, Subset\n",
+    "from transformers import T5ForConditionalGeneration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BATCH_SIZE=16\n",
+    "EPOCHS=100\n",
+    "PATIENCE=10\n",
+    "SAVE_DIR=\".\"\n",
+    "MODEL_NAME=\"t5-small\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_ds = CFGenerativeDataset(\"./configs/datasets/snli_1.0_contra.yaml\", \"./datasets/snli_1.0_contra\", split=\"train\")\n",
+    "val_ds = CFGenerativeDataset(\"./configs/datasets/snli_1.0_contra.yaml\", \"./datasets/snli_1.0_contra\", split=\"val\")\n",
+    "\n",
+    "subset_indices = list(range(100))\n",
+    "train_ds = Subset(train_ds, subset_indices)\n",
+    "val_ds = Subset(val_ds, subset_indices)\n",
+    "\n",
+    "train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)\n",
+    "val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)\n",
+    "\n",
+    "model=T5ForConditionalGeneration.from_pretrained(MODEL_NAME)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fit(\n",
+    "    train_dl,\n",
+    "    val_dl,\n",
+    "    model,\n",
+    "    epochs= 2,\n",
+    "    patience= 10,\n",
+    "    save_dir= \"models/t5-model\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Sagemaker"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.pytorch import PyTorch\n",
+    "from sagemaker.inputs import TrainingInput\n",
+    "\n",
+    "def train()->None:\n",
+    "    estimator = PyTorch(\n",
+    "        entry_point=f\"sagemaker_t5.py\",\n",
+    "        role=\"arn:aws:iam::065257926712:role/SagemakerRole\",\n",
+    "        framework_version=\"2.0\",\n",
+    "        py_version=\"py310\",\n",
+    "        source_dir=\"src\",\n",
+    "        output_path=f\"s3://sliit-xai/training-jobs/results\",\n",
+    "        code_location=f\"s3://sliit-xai/training-jobs/code\",\n",
+    "        instance_count=1,\n",
+    "        instance_type=\"ml.g4dn.xlarge\",\n",
+    "        max_run=5 * 24 * 60 * 60\n",
+    "    )\n",
+    "    # Setting the input channels for tuning job\n",
+    "    s3_input_train = TrainingInput(s3_data=\"s3://sliit-xai/datasets/snli_1.0_contra/\", s3_data_type=\"S3Prefix\")\n",
+    "\n",
+    "    # Start job\n",
+    "    estimator.fit(inputs={\"train\": s3_input_train})\n",
+    "\n",
+    "train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.cf_generators import T5Generator\n",
+    "\n",
+    "cf_gen = T5Generator(\"./configs/models/t5-cf-generator.yaml\", \"./models/t5-cf-generator\", download=True)\n",
+    "review = \"\\\"Ice Age\\\" is an animated masterpiece that captivates both young and old audiences alike. The film's heartwarming and humorous storyline follows a mismatched group of prehistoric creatures on an epic adventure, which is filled with laughter, action, and valuable life lessons. The endearing characters, including Manny the mammoth, Sid the sloth, and Diego the saber-toothed tiger, effortlessly steal our hearts with their lovable quirks and undeniable chemistry. The animation is visually stunning, with breathtaking ice-capped landscapes and attention to detail that immerses viewers in a prehistoric wonderland. The movie's witty dialogue, clever jokes, and hilarious antics ensure that every moment is a joy to watch. Beyond the entertainment, \\\"Ice Age\\\" touches on themes of friendship, acceptance, and the importance of family, making it a truly heartwarming experience. This timeless classic stands the test of time, and its charm remains undiminished, making it a must-watch for anyone seeking an enchanting and delightful cinematic experience.\"\n",
+    "sentence_count = 4\n",
+    "contrads = cf_gen(review, sentence_count)\n",
+    "\n",
+    "print(\"\\n\".join(contrads))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# WordFlippingGenerator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.cf_generators import WordFlippingGenerator\n",
+    "\n",
+    "review = \"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me. The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice (crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) Watching Oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.\"\n",
+    "sentence_count = 4\n",
+    "\n",
+    "config_path = \"./configs/models/wf-cf-generator.yaml\"\n",
+    "wf = WordFlippingGenerator(config_path)\n",
+    "contrads = wf(review, sentence_count)\n",
+    "print(\"\\n\".join(contrads))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wf.describe_tags()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "xai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.17"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hyperparameter-tuning.ipynb
+++ b/hyperparameter-tuning.ipynb
+{
+    "cells": [
+        {
+            "cell_type": "code",
+            "execution_count": 1,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "from src.datasets import IMDBDataset\n",
+                "from src.models import RFModel, SVCModel, KNNModel, LRModel\n",
+                "import numpy as np"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 2,
+            "metadata": {},
+            "outputs": [
+                {
+                    "name": "stderr",
+                    "output_type": "stream",
+                    "text": [
+                        "/home/avishka/Personal/Projects/xai/src/datasets.py:25: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
+                        "  soup = BeautifulSoup(text, \"html.parser\")\n"
+                    ]
+                }
+            ],
+            "source": [
+                "ds_config_path = \"./datasets/imdb/dataset.yaml\"\n",
+                "ds = IMDBDataset(ds_config_path)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 3,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# Number of trees in random forest\n",
+                "n_estimators = np.linspace(start = 10, stop = 100, num = 10).astype(int).tolist()\n",
+                "# Maximum number of levels in tree\n",
+                "max_depth = np.linspace(10, 100, num = 5).astype(int).tolist()\n",
+                "max_depth.append(None)\n",
+                "# Minimum number of samples required to split a node\n",
+                "min_samples_split = [2, 5, 10]\n",
+                "# Minimum number of samples required at each leaf node\n",
+                "min_samples_leaf = [1, 2, 4]\n",
+                "# Method of selecting samples for training each tree\n",
+                "bootstrap = [True, False]\n",
+                "rf_model = RFModel(n_estimators, max_depth, min_samples_split, min_samples_leaf, bootstrap)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 2,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "C = [0.1, 1, 10, 100]\n",
+                "gamma = [1, 0.1, 0.01, 0.001]\n",
+                "kernel = [\"rbf\"]\n",
+                "svc_model = SVCModel(C, gamma, kernel)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 2,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "n_neighbors = [30, 40, 50, 60, 70, 80, 90]\n",
+                "metric = [\"manhattan\", \"minkowski\"]\n",
+                "weights = [\"uniform\", \"distance\"]\n",
+                "knn_model = KNNModel(n_neighbors, metric, weights)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 2,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "penalty = [\"l1\", \"l2\", \"elasticnet\"]\n",
+                "C = np.logspace(-4, 4, 20)\n",
+                "solver = [\"lbfgs\", \"newton-cg\", \"sag\"]\n",
+                "max_iter = [100, 1000, 5000]\n",
+                "lr_model = LRModel(penalty, C, solver, max_iter)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": []
+        }
+    ],
+    "metadata": {
+        "kernelspec": {
+            "display_name": "xai-env",
+            "language": "python",
+            "name": "python3"
+        },
+        "language_info": {
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "file_extension": ".py",
+            "mimetype": "text/x-python",
+            "name": "python",
+            "nbconvert_exporter": "python",
+            "pygments_lexer": "ipython3",
+            "version": "3.10.11"
+        },
+        "orig_nbformat": 4
+    },
+    "nbformat": 4,
+    "nbformat_minor": 2
+}
--- a/imdb-counterfactual.ipynb
+++ b/imdb-counterfactual.ipynb
--- a/models-dataset-usage.ipynb
+++ b/models-dataset-usage.ipynb
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dataset usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Matplotlib is building the font cache; this may take a moment.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating dataset\n",
+      "Downloading from source (https://sliit-xai.s3.ap-south-1.amazonaws.com/datasets/imdb.zip) to c:\\Users\\DELL\\Desktop\\research\\xai\\datasets\\imdb\\imdb.zip\n",
+      "Initializing objects\n",
+      "Preprocessing\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\DELL\\Desktop\\research\\xai\\src\\processors.py:30: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
+      "  soup = BeautifulSoup(text, \"html.parser\")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Encoding\n",
+      "Dataset created\n",
+      "(4999, 11612) (40000, 11612) (5000, 11612) (4999,) (40000,) (5000,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.datasets import IMDBDataset\n",
+    "ds = IMDBDataset(config_path=\"./configs/datasets/imdb.yaml\", root=\"datasets/imdb\", download=True)\n",
+    "ds.set_split(\"train\")\n",
+    "print(ds.x_test.shape, ds.x_train.shape, ds.x_val.shape, ds.y_test.shape, ds.y_train.shape, ds.y_val.shape)\n",
+    "x, y = ds[0]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading from source (https://sliit-xai.s3.ap-south-1.amazonaws.com/models/analysis-models.zip) to c:\\Users\\DELL\\Desktop\\research\\xai\\models\\analysis-models\\analysis-models.zip\n",
+      "A collection of pretrained sklearn models.\n",
+      "Contains the models ['knn', 'lr', 'rf', 'svm']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.models import AnalysisModels\n",
+    "models = AnalysisModels(config_path=\"./configs/models/analysis-models.yaml\", root=\"models/analysis-models\", download=True)\n",
+    "print(models)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "models.knn(\"This is a nice movie\"), models.rf(\"This is very boring\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "models.knn.model.predict_proba"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "xai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.17"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/notebooks/Replacement_SEDC_LR.ipynb
+++ b/notebooks/Replacement_SEDC_LR.ipynb
--- a/notebooks/rf_decision_path.ipynb
+++ b/notebooks/rf_decision_path.ipynb
--- a/src/__init__.py
+++ b/src/__init__.py
+from .test_bench import TestBench
--- a/src/analyzers/__init__.py
+++ b/src/analyzers/__init__.py
+from .knn import KNNAnalyzer
+from .svm import SVMMirrorAnalyzer as SVMAnalyzer
+from .rf import RFAnalyzer
+from .lr import LRAnalyzer
--- a/src/analyzers/base.py
+++ b/src/analyzers/base.py
+from typing import Any
+
+
+class BaseAnalyzer:
+    """Interface skeleton for analyzers.
+
+    Every method defined here raises ``NotImplementedError``; a concrete
+    analyzer has to override the ones it supports.
+    """
+
+    # Class-level placeholders; each is ``None`` until something assigns it.
+    # NOTE(review): their roles are inferred from the names only -- confirm
+    # against the concrete analyzers.
+    _model = None
+    _text_vectorizer = None
+    _cf_generator = None
+    _report_data = None
+
+    def explanation(self) -> str:
+        """Return an explanation as a string.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError("Method not implemented yet.")
+
+    def __call__(self, text: str, search_space: int) -> str:
+        """Analyze ``text`` and return a string result.
+
+        Args:
+            text: Input text.
+            search_space: Integer search parameter; its exact meaning is not
+                defined here (TODO confirm in the concrete analyzers).
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError("Method not implemented yet.")
+
+    def set_config(self, config) -> None:
+        """Apply ``config`` to the analyzer.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError("Method not implemented yet.")
--- a/src/analyzers/knn.py
+++ b/src/analyzers/knn.py
--- a/src/analyzers/lr.py
+++ b/src/analyzers/lr.py
--- a/src/analyzers/rf.py
+++ b/src/analyzers/rf.py
--- a/src/analyzers/svm.py
+++ b/src/analyzers/svm.py
--- a/src/cf_generators/__init__.py
+++ b/src/cf_generators/__init__.py
+from .t5 import T5Generator
+from .wf import WordFlippingGenerator
--- a/src/cf_generators/base.py
+++ b/src/cf_generators/base.py
+from typing import List, Dict, Any
+from ..models import DownloadableModel
+
+
+class BaseGenerator(DownloadableModel):
+    """Interface skeleton for generators built on ``DownloadableModel``.
+
+    Both methods raise ``NotImplementedError`` and must be overridden by a
+    concrete generator.
+    """
+
+    def __call__(self, inp: str, variations: int = 4) -> List[str]:
+        """Generate a list of strings from ``inp``.
+
+        Args:
+            inp: Input text.
+            variations: Requested number of variations (defaults to 4).
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError("Method not implemented yet.")
+
+    def set_config(self, config: Dict[str, Any]) -> None:
+        """Apply ``config`` to the generator.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError("Method not implemented yet.")
--- a/src/cf_generators/t5.py
+++ b/src/cf_generators/t5.py
+from .base import BaseGenerator
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import torch
+from typing import List, Dict
+import nltk
+
+
+class T5Generator(BaseGenerator):
+    def __init__(
+        self, config_path: str, root: str = None, download: bool = False
+    ) -> None:
+        super().__init__(config_path, root, download)
+        tokenizer = T5Tokenizer.from_pretrained(self.config["model_config"])
+        model = T5ForConditionalGeneration.from_pretrained(self.config["model_config"])
+        state = torch.load(self.config["paths"]["model"])
+        model.load_state_dict(state)
+        self.tokenizer = tokenizer
+        self.model = model
+
+    def __call__(self, inp: str, variations: int) -> List[str]:
+        # format input
+        inp = nltk.sent_tokenize(inp)
+        inp = ["contradict: " + sent for sent in inp]
+
+        # generate
+        sentence_sets = []
+        for sent in inp:
+            input_ids = self.tokenizer(sent, return_tensors="pt").input_ids
+            label_ids = self.model.generate(
+                input_ids, num_return_sequences=variations, num_beams=variations
+            )
+            sents = [
+                self.tokenizer.decode(label_id_row, skip_special_tokens=True)
+                for label_id_row in label_ids
+            ]
+            sentence_sets.append(sents)
+
+        paras = []
+        for sentences in zip(*sentence_sets):
+            para = " ".join(sentences)
+            paras.append(para)
+
+        return paras
+
+    def set_config(self, config: Dict) -> None:
+        print("WARN: Nothing to do")
--- a/src/cf_generators/wf.py
+++ b/src/cf_generators/wf.py
--- a/src/datasets.py
+++ b/src/datasets.py
--- a/src/models.py
+++ b/src/models.py
--- a/src/processors.py
+++ b/src/processors.py
--- a/src/requirements.txt
+++ b/src/requirements.txt
+bs4
+scikit-learn
+nltk
+wget
+sentencepiece
+transformers
+accelerate>=0.20.3
\ No newline at end of file
--- a/src/sagemaker_main.py
+++ b/src/sagemaker_main.py
+import os
+import joblib
+from datasets import IMDBDataset
+from models import SVCModel, RFModel, KNNModel, LRModel
+
+if __name__ == "__main__":
+    # Directory variables
+    data_dir = os.environ["SM_CHANNEL_TRAIN"]
+    intermediate_data_dir = os.environ["SM_OUTPUT_INTERMEDIATE_DIR"]
+    model_output_dir = os.environ["SM_MODEL_DIR"]
+    output_data_dir = os.environ["SM_OUTPUT_DATA_DIR"]
+
+    ds_config_path = f"{data_dir}/imdb.yaml"
+    ds = IMDBDataset(ds_config_path, vectorizer_fitted=False)
+    print("Dataset instantiated")
+
+    rf_model = RFModel()
+    svc_model = SVCModel()
+    knn_model = KNNModel()
+    lr_model = LRModel()
+    print("Models instantiated")
+
+    rf_model.fit(ds.x_train, ds.y_train)
+    print("RF completed")
+    svc_model.fit(ds.x_train, ds.y_train)
+    print("SVC completed")
+    knn_model.fit(ds.x_train, ds.y_train)
+    print("KNN completed")
+    lr_model.fit(ds.x_train, ds.y_train)
+    print("LR completed")
+
+    rf_model.save(f"{model_output_dir}/rf.pkl")
+    svc_model.save(f"{model_output_dir}/svm.pkl")
+    knn_model.save(f"{model_output_dir}/knn.pkl")
+    lr_model.save(f"{model_output_dir}/lr.pkl")
+    joblib.dump(ds.input_encoder, f"{model_output_dir}/tfidf.pkl")
+    print("Models saved")
--- a/src/sagemaker_t5.py
+++ b/src/sagemaker_t5.py
--- a/src/scripts/make-contra-ds.py
+++ b/src/scripts/make-contra-ds.py
--- a/src/test_bench.py
+++ b/src/test_bench.py
--- a/src/train/t5.py
+++ b/src/train/t5.py
--- a/train.ipynb
+++ b/train.ipynb