Commit ac7b89f2 authored by Rangana PWM

final EDA code file

parent 0860c92c
{
"cells": [
{
"cell_type": "code",
"execution_count": 59,
"id": "c2c2b144",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "e17bc8d8",
"metadata": {},
"outputs": [],
"source": [
"DEPRESSIVE_TWEETS_CSV1 = 'depressive_tweets_processed.csv'\n",
"DEPRESSIVE_TWEETS_CSV2 = 'd_tweets.csv'\n",
"DEPRES_NROWS = 3200"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "532b2c3a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 object\n",
"1 object\n",
"2 object\n",
"3 object\n",
"4 object\n",
"5 object\n",
"6 object\n",
"7 object\n",
"8 object\n",
"dtype: object"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First source: pipe-separated file without a header row; keep the first nine columns.\n",
"depressive_tweets_df1 = pd.read_csv(\n",
"    DEPRESSIVE_TWEETS_CSV1,\n",
"    sep='|',\n",
"    header=None,\n",
"    usecols=range(9),\n",
"    nrows=DEPRES_NROWS,\n",
")\n",
"# Second source: read with the default separator and its own header row.\n",
"depressive_tweets_df2 = pd.read_csv(DEPRESSIVE_TWEETS_CSV2, encoding=\"UTF-8\")\n",
"\n",
"# Display the inferred dtype of each column in the first frame.\n",
"depressive_tweets_df1.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "0390bd5e",
"metadata": {},
"outputs": [],
"source": [
"# Reduce both sources to a single 'SentimentText' column holding the tweet text.\n",
"# In the headerless file the text is the column at position 5; in the second file it is 'tweet'.\n",
"# rename() skips labels that are absent, so re-running this cell after it has already\n",
"# executed is harmless (the previous in-place version raised on a second run).\n",
"depressive_tweets_df1 = depressive_tweets_df1.rename(columns={5: 'SentimentText'})[['SentimentText']]\n",
"depressive_tweets_df2 = depressive_tweets_df2.rename(columns={'tweet': 'SentimentText'})[['SentimentText']]"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "84044e3a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SentimentText</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>The lack of this understanding is a small but ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>i just told my parents about my depression and...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>depression is something i don't speak about ev...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Made myself a tortilla filled with pb&amp;j. My de...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>@WorldofOutlaws I am gonna need depression med...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3491</th>\n",
" <td>Cough sneezes are tho worst</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3492</th>\n",
" <td>I can be your sad whore ahaha</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3493</th>\n",
" <td>Bro that feeling you get after you sneeze😍🥵🥴</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3494</th>\n",
" <td>Long pisses are the best</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3495</th>\n",
" <td>Dwight you ignorant slut.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5841 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" SentimentText\n",
"0 The lack of this understanding is a small but ...\n",
"1 i just told my parents about my depression and...\n",
"2 depression is something i don't speak about ev...\n",
"3 Made myself a tortilla filled with pb&j. My de...\n",
"4 @WorldofOutlaws I am gonna need depression med...\n",
"... ...\n",
"3491 Cough sneezes are tho worst\n",
"3492 I can be your sad whore ahaha\n",
"3493 Bro that feeling you get after you sneeze😍🥵🥴\n",
"3494 Long pisses are the best\n",
"3495 Dwight you ignorant slut.\n",
"\n",
"[5841 rows x 1 columns]"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack the two depressive sources vertically. Each part keeps its own row labels,\n",
"# so labels repeat in the combined frame.\n",
"depressive_tweets_df = pd.concat(\n",
"    objs=(depressive_tweets_df1, depressive_tweets_df2),\n",
"    axis=0,\n",
"    ignore_index=False,\n",
")\n",
"depressive_tweets_df"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "2f4ae357",
"metadata": {},
"outputs": [],
"source": [
"# Tag every row of this frame with the depressive class label.\n",
"depressive_tweets_df = depressive_tweets_df.assign(label='Depress_Level')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "dafc82b9",
"metadata": {},
"outputs": [],
"source": [
"# 80/20 split of the depressive class.\n",
"# The frame's row labels are not unique, and drop() removes rows by label, so\n",
"# `df.drop(train.index)` would also throw away unsampled rows that share a label with a\n",
"# sampled one. Work with row positions instead: the same rows are sampled (same seed,\n",
"# same length) and the test split is the exact positional complement.\n",
"depressive_pool = depressive_tweets_df.reset_index(drop=True)\n",
"train_positions = depressive_pool.sample(frac=0.8, random_state=25).index\n",
"test_positions = depressive_pool.index.difference(train_positions)\n",
"train_Depress = depressive_tweets_df.iloc[train_positions]\n",
"test_Depress = depressive_tweets_df.iloc[test_positions]\n",
"# The two splits must partition the class exactly.\n",
"assert len(train_Depress) + len(test_Depress) == len(depressive_tweets_df)"
]
},
{
"cell_type": "markdown",
"id": "aa5d507a",
"metadata": {},
"source": [
"# For non-depressive tweets"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "4b93d75d",
"metadata": {},
"outputs": [],
"source": [
"# Input files for the non-depressive class and the row cap applied to each.\n",
"NON_DEPRESSIVE_TWEETS_CSV1 = 'Sentiment Analysis Dataset 2.csv'\n",
"NON_DEPRESSIVE_TWEETS_CSV2 = 'non_d_tweets.csv'\n",
"NON_DEPRES_NROWS1 = 2300\n",
"NON_DEPRES_NROWS2 = 4300\n",
"\n",
"non_depressive_tweets_df1 = pd.read_csv(NON_DEPRESSIVE_TWEETS_CSV1, encoding=\"UTF-8\", nrows=NON_DEPRES_NROWS1)\n",
"non_depressive_tweets_df2 = pd.read_csv(NON_DEPRESSIVE_TWEETS_CSV2, encoding=\"UTF-8\", nrows=NON_DEPRES_NROWS2)\n",
"\n",
"# Reduce both sources to a single 'SentimentText' column (the second file calls it 'tweet').\n",
"non_depressive_tweets_df1 = non_depressive_tweets_df1[['SentimentText']]\n",
"non_depressive_tweets_df2 = non_depressive_tweets_df2.rename(columns={'tweet': 'SentimentText'})[['SentimentText']]\n",
"\n",
"# ignore_index=True renumbers the rows 0..n-1. Without it both parts keep their own\n",
"# 0-based labels, and the label-based drop() below would remove more rows than were sampled.\n",
"non_depressive_tweets_df = pd.concat(\n",
"    [non_depressive_tweets_df1, non_depressive_tweets_df2],\n",
"    ignore_index=True,\n",
")\n",
"non_depressive_tweets_df[\"label\"] = \"Non_Depress_Level\"\n",
"\n",
"# 80/20 split. The held-out rows are those NOT sampled into this class's own train split\n",
"# (the index of the depressive train split was used here before, which leaked train rows\n",
"# into the test split).\n",
"train_non_Depress = non_depressive_tweets_df.sample(frac=0.8, random_state=25)\n",
"test_non_Depress = non_depressive_tweets_df.drop(train_non_Depress.index)\n",
"assert len(train_non_Depress) + len(test_non_Depress) == len(non_depressive_tweets_df)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "fc4ce3a8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SentimentText object\n",
"dtype: object"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"non_depressive_tweets_df2.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "20cf5e1e",
"metadata": {},
"outputs": [],
"source": [
"# Combine the per-class splits (non-depressive rows first) into the final train/test frames.\n",
"train = pd.concat(objs=(train_non_Depress, train_Depress), axis=0)\n",
"test = pd.concat(objs=(test_non_Depress, test_Depress), axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "a894fad1",
"metadata": {},
"outputs": [],
"source": [
"# Shuffle the rows so the two classes are interleaved. A fixed seed (the same value the\n",
"# train/test sampling uses) keeps the resulting row order reproducible across runs.\n",
"train = train.sample(frac=1, random_state=25)\n",
"test = test.sample(frac=1, random_state=25)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "d3303837",
"metadata": {},
"outputs": [],
"source": [
"# Persist both splits as CSV in the working directory (row index included, per the to_csv default).\n",
"train.to_csv(path_or_buf='train1.csv')\n",
"test.to_csv(path_or_buf='test1.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment