Commit ed1b3f11 authored by Dilip Wijethunga

Merge branch 'IT19240466' into 'master'

It19240466

See merge request !1
parents a8c72c93 5ad2479f
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/scraping.iml" filepath="$PROJECT_DIR$/.idea/scraping.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RunConfigurationProducerService">
<option name="ignoredProducers">
<set>
<option value="com.android.tools.idea.compose.preview.runconfiguration.ComposePreviewRunConfigurationProducer" />
</set>
</option>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>
### How to set up and run
##### Create virtual environment
###### Windows
py -3 -m venv <name of environment>
###### Linux/macOS
python3 -m venv <name of environment>
##### Activate virtual environment
###### Windows
<name of environment>\Scripts\activate
###### Linux/macOS
. <name of environment>/bin/activate
##### Install required libraries
pip install -r requirements.txt
##### Run app locally
python main.py
HEADER = {
"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/50.0.2661.102 Safari/537.36 '
}
This diff is collapsed.
This diff is collapsed.
import csv
import requests
from bs4 import BeautifulSoup
from config import HEADER
# HTML tags to be scraped
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'td', 'li', 'a']
# keywords csv file path
KEYWORDS_PATH = 'keywords.csv'
# init the KEYWORDS global dictionary, mapping each keyword to its weight
KEYWORDS = dict()
def test_with_bs4(url):
response = requests.get(url, headers=HEADER)
soup = BeautifulSoup(response.text, "html.parser")
file = open("test.html", "w+")
file.write(str(soup))
file.close()
def load_keywords():
# access global KEYWORDS
global KEYWORDS
KEYWORDS = dict()
# read csv file
with open(KEYWORDS_PATH) as csv_file:
# init csv reader
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
# header row
if line_count == 0:
print(f'Column names are {", ".join(row)}')
line_count += 1
# other rows
else:
KEYWORDS[row[0]] = int(row[1])
line_count += 1
print(f'Processed {line_count} lines.')
def scrapping_words(url):
# init word dictionary
words = dict()
# download html page of the website
response = requests.get(url, headers=HEADER)
# parse with bs4
soup = BeautifulSoup(response.text, "html.parser")
# iterate through html tags
for tag in TAGS:
# find all inner texts for the tag
rows = soup.find_all(tag)
# iterate through all rows found related to the given tag
for row in rows:
# inner text to lower
sentence = row.get_text().lower()
# keep only alphabetic characters and spaces
sentence = ''.join(x for x in sentence if x.isalpha() or x == ' ')
# split into words
array = sentence.split(' ')
# cleaning array
modified_array = [e.strip() for e in array if len(e.strip()) > 0]
# iterate through each word
for word in modified_array:
# if the word is not in the dict yet, add it
if word not in words.keys():
words[word] = 0
# increase count by 1
words[word] += 1
# return scraped words from the given webpage
return words
def calculate_score(words):
# init total score to zero
total_score = 0
# iterate through scrapped words
for word, frequency in words.items():
# if the scraped word exists in keywords
if word in KEYWORDS.keys():
# multiply the keyword's weight by its frequency and add it to the total score
total_score += KEYWORDS[word] * frequency
return total_score
if __name__ == "__main__":
load_keywords()
scrapped_words = scrapping_words("https://cointelegraph.com/")
score = calculate_score(scrapped_words)
print(f"Score = {score}")
# if score > 0:
# print("========> POSITIVE")
# else:
# print("========> NEGATIVE")
bs4==0.0.1
requests==2.26.0
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import numpy as np\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# load vectorizer\n",
"path_vectorizer = 'final model/vectorizer.pickle'\n",
"with open(path_vectorizer, 'rb') as data:\n",
" vectorizer = pickle.load(data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# load model\n",
"path_model = 'final model/best_rfc.pickle'\n",
"with open(path_model, 'rb') as data:\n",
" model = pickle.load(data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] /Users/ameshmjayaweera/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /Users/ameshmjayaweera/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Downloading punkt and wordnet from NLTK\n",
"nltk.download('punkt')\n",
"print(\"------------------------------------------------------------\")\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/ameshmjayaweera/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Downloading the stop words list\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Saving the lemmatizer into an object\n",
"wordnet_lemmatizer = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Loading the stop words in english\n",
"stop_words = list(stopwords.words('english'))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def pre_processing(sentence):\n",
" # 1.1. Replace \\n and \\t\n",
" sentence = sentence.replace(\"\\r\", \" \")\n",
" sentence = sentence.replace(\"\\n\", \" \")\n",
" \n",
" # 1.2. Convert to lowercase\n",
" sentence = sentence.lower()\n",
" \n",
" # 1.3. Remove punctuation signs\n",
" punctuation_signs = list(\"?:!.,;-$&^*%(){}[]/><@#~`|+_=“”…’−‘\")\n",
" for punct_sign in punctuation_signs:\n",
" sentence = sentence.replace(punct_sign, '')\n",
" \n",
" # 1.4. Remove possessive pronouns\n",
" sentence = sentence.replace(\"'s\", \"\")\n",
" \n",
" # 1.5. Remove numbers\n",
" digits = list(\"1234567890\")\n",
" for digit in digits:\n",
" sentence = sentence.replace(digit, '')\n",
" \n",
" # 1.6. Remove single quote and double quote\n",
" sentence = sentence.replace(\"'\", \"\")\n",
" sentence = sentence.replace('\"', '')\n",
" \n",
" # 1.7. Lemmatization\n",
" lemmatized_list = []\n",
" text_words = sentence.split(\" \")\n",
" for word in text_words:\n",
" lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos=\"v\"))\n",
" sentence = \" \".join(lemmatized_list)\n",
"\n",
" # 1.8. Remove Stop words\n",
" for stop_word in stop_words:\n",
" regex_stopword = r\"\\b\" + stop_word + r\"\\b\"\n",
" sentence = sentence.replace(regex_stopword, '')\n",
" \n",
" # 1.9. Remove Extra Spaces\n",
" sentence = sentence.split()\n",
" sentence = \" \".join(sentence)\n",
" \n",
" return sentence"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"test_input_1 = 'or, how about this. terra was a bad investment because all cryptos operate as if they are ponzi schemes.'\n",
"test_input_2 = 'Honestly, after reading this post and many of the responses, I have to conclude most of the crypto-space is totally fucked. The consept of crypto has been entirely lost, waves of noobs arrive on crypto island, and instead of revelling in the freedom, do everything they can to plan their way to get back off of the island.'\n",
"test_input_3 = 'Funny how people think Bitcoin\\'s risk is comparable to stocks. A lot of these crypto \"investors\" are gonna learn the hard way sooner or later.'"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'or how about this terra be a bad investment because all cryptos operate as if they be ponzi scheme'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pre_processing(test_input_1)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'funny how people think bitcoin risk be comparable to stock a lot of these crypto investors be gonna learn the hard way sooner or later'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pre_processing(test_input_3)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def predict(sentence):\n",
" sentence = pre_processing(sentence)\n",
" vector = vectorizer.transform([sentence]).toarray()\n",
" pred = model.predict(vector)\n",
" return pred[0]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict(test_input_1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict(test_input_3)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
web: gunicorn app:app
### How to set up and run
##### Create virtual environment
###### Windows
py -3 -m venv <name of environment>
###### Linux/macOS
python3 -m venv <name of environment>
##### Activate virtual environment
###### Windows
<name of environment>\Scripts\activate
###### Linux/macOS
. <name of environment>/bin/activate
##### Install required libraries
pip3 install -r requirements.txt
##### Run app locally
flask run
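##### Run with gunicorn (optional)
The Procfile in this repository starts the app with gunicorn; assuming gunicorn is installed, the same command also works locally:
gunicorn app:app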
from flask import Flask, jsonify
from flask_cors import CORS
from flask_apscheduler import APScheduler
from model import schedule_model_training, is_training, CURRENCIES
from web_scrapping import get_sentiment
app = Flask(__name__)
cors = CORS(app, resources={r"/crypto-currency/*": {"origins": "*"}})
app.config['CORS_HEADERS'] = 'Content-Type'
scheduler = APScheduler()
schedule_model_training()
# schedule model re-training every hour (3600-second interval)
scheduler.add_job(id='Scheduled Task', func=schedule_model_training, trigger="interval", seconds=3600)
scheduler.start()
@app.route('/crypto-currency', methods=['GET'])
def index():
return f"<div align='center'><h2>Crypto Currency Forecasting Sever is Active</h2></div>"
@app.route("/crypto-currency/predict", methods=['GET'])
def predict():
if is_training():
response = jsonify({
"message": "all forecasting models are training now!",
"code": 100
})
else:
data = dict()
for currency in list(CURRENCIES.keys()):
if CURRENCIES[currency]["enable"] and CURRENCIES[currency]["available_data"]:
data[currency] = {
"price": CURRENCIES[currency]["price"],
"volume": CURRENCIES[currency]["volume"],
"market_cap": CURRENCIES[currency]["market_cap"]
}
response = jsonify({
"code": 200,
"message": "Success",
"data": data
})
response.headers.add('Access-Control-Allow-Origin', '*')
return response, 200
@app.route("/crypto-currency/sentiment", methods=['GET'])
def sentiment():
response = jsonify({
"code": 200,
"message": "Success",
"sentiment": get_sentiment()
})
response.headers.add('Access-Control-Allow-Origin', '*')
return response, 200
if __name__ == "__main__":
app.run()
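A hedged client-side sketch of the two GET endpoints above, assuming the server is running locally on Flask's default port 5000 (address and port are assumptions, not part of the code above):

import requests

BASE_URL = "http://127.0.0.1:5000/crypto-currency"  # assumed local address and default Flask port

# /predict answers with code 100 while the forecasting models are retraining,
# otherwise code 200 plus per-currency price / volume / market_cap figures.
forecast = requests.get(f"{BASE_URL}/predict").json()
print(forecast["code"], forecast["message"])

# /sentiment returns the latest sentiment value produced by the web_scrapping module.
sentiment = requests.get(f"{BASE_URL}/sentiment").json()
print(sentiment["sentiment"])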
HEADER = {
"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/50.0.2661.102 Safari/537.36 '
}
import os
import ssl
from urllib.request import Request, urlopen
import certifi
from model_training.pp_market_cap import pp_market_cap
from model_training.pp_price import pp_price
from model_training.pp_volume import pp_volume
from web_scrapping import start_web_scrapping, set_sentiment
DATABASE_DIR = f"database{os.sep}"
TRAINING = False
THRESHOLD = 1000000
CURRENCIES = {
"BTC_USD": {
"url": "https://coingecko.com/price_charts/export/1/usd.csv",
"available_data": False,
"path": None,
"enable": True,
"price": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
},
"volume": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
},
"market_cap": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
}
},
"ETH_USD": {
"url": "https://www.coingecko.com/price_charts/export/279/usd.csv",
"available_data": False,
"path": None,
"enable": True,
"price": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
},
"volume": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
},
"market_cap": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
}
},
"PKEX_USD": {
"url": "https://www.coingecko.com/price_charts/export/18616/usd.csv",
"available_data": False,
"path": None,
"enable": True,
"price": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
},
"volume": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
},
"market_cap": {
"today": 0,
"tomorrow": 0,
"score": 0,
"exceeded": False
}
}
}
def download_data_sources():
for currency in list(CURRENCIES.keys()):
CURRENCIES[currency]["available_data"] = False
CURRENCIES[currency]["path"] = None
for currency in list(CURRENCIES.keys()):
request = Request(
url=CURRENCIES[currency]["url"],
headers={'User-Agent': 'Mozilla/5.0'}
)
print(f"download data source for {currency}")
with urlopen(request, context=ssl.create_default_context(cafile=certifi.where())) as file:
downloaded_file = file.read().decode('utf-8')
csv_file = open(f'{DATABASE_DIR}{currency}.csv', "w+")
csv_file.write(downloaded_file)
csv_file.close()
CURRENCIES[currency]["available_data"] = True
CURRENCIES[currency]["path"] = f'{DATABASE_DIR}{currency}.csv'
print(f"successfully downloaded data source for {currency}")
def is_data_sources_configured():
for currency in list(CURRENCIES.keys()):
if CURRENCIES[currency]["enable"] and not CURRENCIES[currency]["available_data"]:
return False
return True
def set_training(_flag):
global TRAINING
TRAINING = _flag
def is_training():
return TRAINING
def schedule_model_training():
set_training(True)
print("start model training")
retry_count = 0
while retry_count < 3:
print("downloading data sources")
retry_count += 1
print(f"attempting - {retry_count}")
download_data_sources()
if is_data_sources_configured():
print("data sources successfully downloaded")
break
# model training
for currency in list(CURRENCIES.keys()):
if CURRENCIES[currency]["enable"] and CURRENCIES[currency]["available_data"]:
file_path = CURRENCIES[currency]["path"]
today_price, pred_price = pp_price(file_path)
today_volume, pred_volume = pp_volume(file_path)
today_market_cap, pred_market_cap = pp_market_cap(file_path)
# price
CURRENCIES[currency]["price"]["today"] = today_price
CURRENCIES[currency]["price"]["tomorrow"] = pred_price
score = ((pred_price - today_price) / today_price) * 10
if score < 0:
score = 0
CURRENCIES[currency]["price"]["score"] = score
flag = False
if pred_price >= THRESHOLD:
flag = True
CURRENCIES[currency]["price"]["exceeded"] = flag
# volume
CURRENCIES[currency]["volume"]["today"] = today_volume
CURRENCIES[currency]["volume"]["tomorrow"] = pred_volume
score = ((pred_volume - today_volume) / today_volume) * 10
if score < 0:
score = 0
CURRENCIES[currency]["volume"]["score"] = score
flag = False
if pred_volume >= THRESHOLD:
flag = True
CURRENCIES[currency]["volume"]["exceeded"] = flag
# market cap
CURRENCIES[currency]["market_cap"]["today"] = today_market_cap
CURRENCIES[currency]["market_cap"]["tomorrow"] = pred_market_cap
score = ((pred_market_cap - today_market_cap) / today_market_cap) * 10
if score < 0:
score = 0
CURRENCIES[currency]["market_cap"]["score"] = score
flag = False
if pred_market_cap >= THRESHOLD:
flag = True
CURRENCIES[currency]["market_cap"]["exceeded"] = flag
print("end model training")
set_training(False)
print(CURRENCIES)
set_sentiment('Not Available')
start_web_scrapping()
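A purely illustrative walk-through of the score/exceeded fields computed above (all numbers hypothetical):

# Hypothetical numbers for the price branch in schedule_model_training():
#   today_price = 40000, pred_price = 41000
#   score = ((41000 - 40000) / 40000) * 10 = 0.25   (a predicted drop gives a negative score, clamped to 0)
#   exceeded = (41000 >= THRESHOLD) -> False, since THRESHOLD is 1000000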
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
def model_training(training_data, scaler):
history = [x for x in training_data]
model = sm.tsa.arima.ARIMA(history, order=(5, 1, 0))
model_fit = model.fit()
output = model_fit.forecast()
return scaler.inverse_transform([[history[-1]]])[0][0], scaler.inverse_transform([[output[0]]])[0][0]
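A minimal, self-contained sketch of driving this helper on a synthetic series (illustrative only; the real callers are the pp_* modules below, and the series and import path here are assumptions):

import numpy as np
from sklearn.preprocessing import StandardScaler
from model_training.helper import model_training  # assumed import path, matching the pp_* modules

values = np.arange(1.0, 101.0).reshape(-1, 1)          # hypothetical series of 100 observations
scaler = StandardScaler()
scaled_series = scaler.fit_transform(values).ravel()    # the helper expects pre-scaled values plus the fitted scaler
last_value, one_step_forecast = model_training(scaled_series, scaler)
print(last_value, one_step_forecast)                    # both mapped back to the original scale via inverse_transform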
import pandas as pd
from sklearn.preprocessing import StandardScaler
from model_training.helper import model_training
def pp_market_cap(file_path):
df = pd.read_csv(file_path, delimiter=',', parse_dates=True, squeeze=True)
df.drop(['total_volume', 'price'], axis=1, inplace=True)
df['market_cap'] = df['market_cap'].fillna(0)
df['snapped_at'] = df['snapped_at'].apply(lambda x: x.split(' ')[0].strip())
df['snapped_at'] = pd.to_datetime(df['snapped_at'], infer_datetime_format=True)
scaler = StandardScaler()
df[['market_cap']] = scaler.fit_transform(df[['market_cap']])
training_data = df['market_cap'].values
return model_training(training_data, scaler)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from model_training.helper import model_training
def pp_price(file_path):
df = pd.read_csv(file_path, delimiter=',', parse_dates=True, squeeze=True)
df.drop(['total_volume', 'market_cap'], axis=1, inplace=True)
df['price'] = df['price'].fillna(0)
df['snapped_at'] = df['snapped_at'].apply(lambda x: x.split(' ')[0].strip())
df['snapped_at'] = pd.to_datetime(df['snapped_at'], infer_datetime_format=True)
scaler = StandardScaler()
df[['price']] = scaler.fit_transform(df[['price']])
training_data = df['price'].values
return model_training(training_data, scaler)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from model_training.helper import model_training
def pp_volume(file_path):
df = pd.read_csv(file_path, delimiter=',', parse_dates=True, squeeze=True)
df.drop(['price', 'market_cap'], axis=1, inplace=True)
df['total_volume'] = df['total_volume'].fillna(0)
df['snapped_at'] = df['snapped_at'].apply(lambda x: x.split(' ')[0].strip())
df['snapped_at'] = pd.to_datetime(df['snapped_at'], infer_datetime_format=True)
scaler = StandardScaler()
df[['total_volume']] = scaler.fit_transform(df[['total_volume']])
training_data = df['total_volume'].values
return model_training(training_data, scaler)
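A short usage sketch for the three preprocessing helpers, assuming a CSV already written by download_data_sources() in model.py (the path below is illustrative):

from model_training.pp_price import pp_price
from model_training.pp_volume import pp_volume
from model_training.pp_market_cap import pp_market_cap

csv_path = "database/BTC_USD.csv"  # assumed to exist after download_data_sources() has run

today_price, predicted_price = pp_price(csv_path)        # last observed price and one-step-ahead forecast
today_volume, predicted_volume = pp_volume(csv_path)     # same for total_volume
today_mcap, predicted_mcap = pp_market_cap(csv_path)     # same for market_cap
print(today_price, predicted_price)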