final EDA code file

ac7b89f2 · Rangana PWM · 0860c92c · ac7b89f2
Commit ac7b89f2 authored May 11, 2023 by Rangana PWM
Hide whitespace changes
Inline Side-by-side

Showing with 311 additions and 0 deletions

EDA/final_EDA_1.1.ipynb EDA/final_EDA_1.1.ipynb +311 -0

No files found.
--- a/EDA/final_EDA_1.1.ipynb
+++ b/EDA/final_EDA_1.1.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "c2c2b144",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "e17bc8d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Row cap applied when reading the first depressive-tweet file.\n",
+    "DEPRES_NROWS = 3200\n",
+    "\n",
+    "# CSV files supplying the depressive tweets.\n",
+    "DEPRESSIVE_TWEETS_CSV1 = 'depressive_tweets_processed.csv'\n",
+    "DEPRESSIVE_TWEETS_CSV2 = 'd_tweets.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "532b2c3a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    object\n",
+       "1    object\n",
+       "2    object\n",
+       "3    object\n",
+       "4    object\n",
+       "5    object\n",
+       "6    object\n",
+       "7    object\n",
+       "8    object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 111,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Source 1 is pipe-delimited with no header row; read at most DEPRES_NROWS rows\n",
+    "# and keep only the first nine columns.\n",
+    "depressive_tweets_df1 = pd.read_csv(\n",
+    "    DEPRESSIVE_TWEETS_CSV1,\n",
+    "    sep='|',\n",
+    "    header=None,\n",
+    "    usecols=range(9),\n",
+    "    nrows=DEPRES_NROWS,\n",
+    ")\n",
+    "\n",
+    "# Source 2 is a regular comma-separated file with a header row.\n",
+    "depressive_tweets_df2 = pd.read_csv(DEPRESSIVE_TWEETS_CSV2, encoding=\"UTF-8\")\n",
+    "\n",
+    "depressive_tweets_df1.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "0390bd5e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Source 2 keeps its text in 'tweet'; expose it as 'SentimentText' and drop the rest.\n",
+    "depressive_tweets_df2['SentimentText'] = depressive_tweets_df2['tweet']\n",
+    "depressive_tweets_df2 = depressive_tweets_df2[['SentimentText']]\n",
+    "\n",
+    "# Source 1 was read without a header: label its nine columns, then keep only the text.\n",
+    "source1_labels = ['A', 'B', 'C', 'D', 'E', 'SentimentText', 'G', 'H', 'I']\n",
+    "depressive_tweets_df1.columns = source1_labels\n",
+    "depressive_tweets_df1 = depressive_tweets_df1[['SentimentText']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "84044e3a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SentimentText</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>The lack of this understanding is a small but ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>i just told my parents about my depression and...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>depression is something i don't speak about ev...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Made myself a tortilla filled with pb&amp;j. My de...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>@WorldofOutlaws I am gonna need depression med...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3491</th>\n",
+       "      <td>Cough sneezes are tho worst</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3492</th>\n",
+       "      <td>I can be your sad whore ahaha</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3493</th>\n",
+       "      <td>Bro that feeling you get after you sneeze😍🥵🥴</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3494</th>\n",
+       "      <td>Long pisses are the best</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3495</th>\n",
+       "      <td>Dwight you ignorant slut.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5841 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          SentimentText\n",
+       "0     The lack of this understanding is a small but ...\n",
+       "1     i just told my parents about my depression and...\n",
+       "2     depression is something i don't speak about ev...\n",
+       "3     Made myself a tortilla filled with pb&j. My de...\n",
+       "4     @WorldofOutlaws I am gonna need depression med...\n",
+       "...                                                 ...\n",
+       "3491                        Cough sneezes are tho worst\n",
+       "3492                      I can be your sad whore ahaha\n",
+       "3493       Bro that feeling you get after you sneeze😍🥵🥴\n",
+       "3494                           Long pisses are the best\n",
+       "3495                          Dwight you ignorant slut.\n",
+       "\n",
+       "[5841 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# ignore_index=True renumbers the combined rows 0..n-1. Both sources arrive with\n",
+    "# their own row labels, and the train/test split further down drops rows by label,\n",
+    "# so repeated labels would remove extra rows from the test set.\n",
+    "depressive_tweets_df = pd.concat(\n",
+    "    [depressive_tweets_df1, depressive_tweets_df2], ignore_index=True\n",
+    ")\n",
+    "depressive_tweets_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "2f4ae357",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "depressive_tweets_df['label'] = 'Depress_Level'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "dafc82b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 80/20 split with a fixed seed; the test set is every row not sampled for training.\n",
+    "train_Depress = depressive_tweets_df.sample(frac=0.8, random_state=25)\n",
+    "test_Depress = depressive_tweets_df.drop(train_Depress.index)\n",
+    "\n",
+    "# The two parts must partition the labelled frame exactly.\n",
+    "assert len(train_Depress) + len(test_Depress) == len(depressive_tweets_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa5d507a",
+   "metadata": {},
+   "source": [
+    "# Non-depressive tweets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "4b93d75d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NON_DEPRESSIVE_TWEETS_CSV1 = 'Sentiment Analysis Dataset 2.csv'\n",
+    "NON_DEPRESSIVE_TWEETS_CSV2 = 'non_d_tweets.csv'\n",
+    "# Row caps applied when reading each non-depressive source.\n",
+    "NON_DEPRES_NROWS1 = 2300\n",
+    "NON_DEPRES_NROWS2 = 4300\n",
+    "\n",
+    "non_depressive_tweets_df1 = pd.read_csv(\n",
+    "    NON_DEPRESSIVE_TWEETS_CSV1, encoding=\"UTF-8\", nrows=NON_DEPRES_NROWS1\n",
+    ")\n",
+    "non_depressive_tweets_df2 = pd.read_csv(\n",
+    "    NON_DEPRESSIVE_TWEETS_CSV2, encoding=\"UTF-8\", nrows=NON_DEPRES_NROWS2\n",
+    ")\n",
+    "\n",
+    "# Reduce both sources to a single 'SentimentText' column.\n",
+    "non_depressive_tweets_df1 = non_depressive_tweets_df1[['SentimentText']]\n",
+    "non_depressive_tweets_df2['SentimentText'] = non_depressive_tweets_df2['tweet']\n",
+    "non_depressive_tweets_df2 = non_depressive_tweets_df2[['SentimentText']]\n",
+    "\n",
+    "# ignore_index=True gives the combined frame unique 0..n-1 row labels, which the\n",
+    "# label-based drop() below needs in order to remove only the sampled rows.\n",
+    "non_depressive_tweets_df = pd.concat(\n",
+    "    [non_depressive_tweets_df1, non_depressive_tweets_df2], ignore_index=True\n",
+    ")\n",
+    "non_depressive_tweets_df[\"label\"] = \"Non_Depress_Level\"\n",
+    "\n",
+    "# 80/20 split with the same seed as the depressive split. The held-out rows are\n",
+    "# the complement of this frame's own training sample (train_non_Depress), not of\n",
+    "# the depressive one.\n",
+    "train_non_Depress = non_depressive_tweets_df.sample(frac=0.8, random_state=25)\n",
+    "test_non_Depress = non_depressive_tweets_df.drop(train_non_Depress.index)\n",
+    "\n",
+    "assert len(train_non_Depress) + len(test_non_Depress) == len(non_depressive_tweets_df)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "fc4ce3a8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "SentimentText    object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "non_depressive_tweets_df2.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "20cf5e1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stack the per-class splits, non-depressive rows first, into the final frames.\n",
+    "train = pd.concat([train_non_Depress, train_Depress], axis=0)\n",
+    "test = pd.concat([test_non_Depress, test_Depress], axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "a894fad1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Shuffle so the two classes are interleaved. The fixed seed makes the row order,\n",
+    "# and therefore the exported files, reproducible; reset_index(drop=True) replaces\n",
+    "# the row labels carried over from the per-class frames with a clean 0..n-1 index.\n",
+    "train = train.sample(frac=1, random_state=25).reset_index(drop=True)\n",
+    "test = test.sample(frac=1, random_state=25).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "d3303837",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist both splits as CSV files in the working directory (row index included).\n",
+    "for split_frame, csv_name in ((train, 'train1.csv'), (test, 'test1.csv')):\n",
+    "    split_frame.to_csv(csv_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}