Commit d68b237f authored by IT18396164-Silva K.K.S

Merge branch 'v1/be/IT18396164/classification_and_clustering_ml' into 'development'

V1/be/it18396164/classification and clustering ml

See merge request !1
parents c71d63e4 1f8d77d9
@@ -24,27 +24,7 @@ import json
class TopicCluster:
    def cluster():
        texts = [
            "Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
            "Registrar of Muslim Marriages -Gampaha",
            "Registrar of Births Deaths and Marriages",
            "Registrar of Muslim Marriages -Ratnapura",
            "Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
            "Teacher Services 2021 for sinhala,Tamil and English-Kaluthara District",
            "Teacher Services 2021 for sinhala,Tamil and English-Galle District",
            "Teacher Services for sinhala-Ratnapura District",
            "Medical officer preliminary grade i",
            "medical consultant",
            "medical officer grade i",
            "medical officer grade ii",
            "Medical officer preliminary grade ii",
            "Community Development Assistant",
            "Data Processing Assistant -colombo",
            "Community Development Assistant",
            "Data Processing Assistant -ratnapura",
            "Social Development Assistant",
        ]
        # clustering with k-means
        count_vectorizer = CountVectorizer()
        # .fit_transform TOKENIZES and COUNTS
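A side note on this step: CountVectorizer.fit_transform tokenizes and counts in a single call, producing a document-term matrix. A minimal sketch of what that yields, with a two-line stand-in corpus rather than the project's data (get_feature_names_out replaces the deprecated get_feature_names in scikit-learn 1.0+):

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

sample_texts = ["medical officer grade i", "medical consultant"]  # stand-in data
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(sample_texts)  # tokenizes and counts in one step
# rows = documents, columns = vocabulary terms, cells = raw term counts
print(pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names_out()))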
@@ -79,11 +59,6 @@ class TopicCluster:
        X = count_vectorizer.fit_transform(texts)
        # print(count_vectorizer.get_feature_names())
        tfidf_vectorizer = TfidfVectorizer(
            stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
        X = tfidf_vectorizer.fit_transform(texts)
        pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
        tfidf_vectorizer = TfidfVectorizer(
            stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
        X = tfidf_vectorizer.fit_transform(texts)
@@ -98,7 +73,7 @@ class TopicCluster:
        idf_df = pd.DataFrame(
            X.toarray(), columns=idf_vectorizer.get_feature_names())
        # idf_df
        # updated clustering code
        # use_idf=True is default, but I'll leave it in
        idf_vectorizer = TfidfVectorizer(
            stop_words='english', tokenizer=stemming_tokenizer, use_idf=True, norm='l2')
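The two vectorizer configurations differ in weighting: use_idf=False with norm='l1' gives rows of plain term-frequency proportions, while use_idf=True with norm='l2' reweights by inverse document frequency and scales each row to unit Euclidean length. A minimal side-by-side sketch, omitting the project's stemming_tokenizer (defined elsewhere in the file):

from sklearn.feature_extraction.text import TfidfVectorizer

sample_texts = ["medical officer grade i", "medical officer grade ii"]  # stand-in data
# term frequencies only; each row sums to 1
tf_l1 = TfidfVectorizer(stop_words='english', use_idf=False, norm='l1')
print(tf_l1.fit_transform(sample_texts).toarray())
# idf-weighted; each row scaled to unit Euclidean length
tfidf_l2 = TfidfVectorizer(stop_words='english', use_idf=True, norm='l2')
print(tfidf_l2.fit_transform(sample_texts).toarray())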
@@ -114,12 +89,7 @@ class TopicCluster:
        l2_df = pd.DataFrame(
            X.toarray(), columns=l2_vectorizer.get_feature_names())
        # l2_df
        # removing unwanted code
        # Initialize a vectorizer
        vectorizer = TfidfVectorizer(
            use_idf=True, tokenizer=stemming_tokenizer, stop_words='english')
        X = vectorizer.fit_transform(texts)
        # distortions = []
        # K = range(1, 8)
        # for k in K:
@@ -133,7 +103,7 @@ class TopicCluster:
        # plt.ylabel('Distortion')
        # plt.title('The Elbow Method showing the optimal k')
        # plt.show()
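The commented-out block above is the elbow method: fit k-means for a range of k and plot the distortion so the bend in the curve suggests a good cluster count. A runnable sketch under the assumption that distortion means KMeans inertia (within-cluster sum of squared distances), using a stand-in corpus:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

sample_texts = ["medical officer", "medical consultant", "teacher services galle",
                "teacher services kaluthara", "registrar of marriages",
                "registrar of births", "data processing assistant",
                "community development assistant"]  # stand-in data
X = TfidfVectorizer(stop_words='english').fit_transform(sample_texts)
distortions = []
K = range(1, 8)
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    distortions.append(km.inertia_)  # within-cluster sum of squares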
        # get the number of clusters automatically
        def get_k_value(y_list):
            try:
                coordinate_dict = {}
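The body of get_k_value is cut off by the diff, but judging from the name and the commented elbow plot it picks k from the distortion curve automatically. One common heuristic for that, given here as a hypothetical reconstruction rather than the committed code, takes the point farthest from the straight line joining the ends of the curve:

import numpy as np

def get_k_value(y_list):
    # hypothetical elbow finder: the k whose (k, distortion) point lies
    # farthest from the chord between the first and last curve points
    x = np.arange(1, len(y_list) + 1, dtype=float)
    y = np.asarray(y_list, dtype=float)
    chord = np.array([x[-1] - x[0], y[-1] - y[0]])
    chord /= np.linalg.norm(chord)
    vecs = np.column_stack([x - x[0], y - y[0]])
    dists = np.abs(vecs[:, 0] * chord[1] - vecs[:, 1] * chord[0])
    return int(x[np.argmax(dists)])

# e.g. get_k_value([100, 45, 20, 15, 13, 12, 11]) -> 3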
@@ -182,7 +152,7 @@ class TopicCluster:
        # transformed_everything[-n_clusters:, 1], marker='o')
        # plt.show()
        # check the similarity using Levenshtein distance
        def get_category_name(top_words):
            avgList = []
            category = ''
@@ -209,7 +179,7 @@ class TopicCluster:
            print("\n")
            return category
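get_category_name is also mostly elided; the comment says it compares the cluster's top words by Levenshtein distance to pick a label. A self-contained sketch of one plausible reading, not the author's implementation: label the cluster with the word closest, on average, to all the other top words.

def levenshtein(a, b):
    # classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def get_category_name(top_words):
    # hypothetical: pick the word with the lowest average edit distance to its peers
    def avg_distance(word):
        others = [w for w in top_words if w != word]
        return sum(levenshtein(word, o) for o in others) / max(len(others), 1)
    return min(top_words, key=avg_distance)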
        # using topic modeling, get the top words
        cluster_names_list = []
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
@@ -221,7 +191,7 @@ class TopicCluster:
        print("Cluster {}: {}".format(cat, ' '.join(top_ten_words)))
        print("\n")
        print(cluster_names_list)
        # matching the clustering output
        newkmlist_ = km.labels_
        for tuple in cluster_names_list:
            newkmlist_ = [tuple[1] if i == tuple[0] else i for i in newkmlist_]
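For context, km.cluster_centers_.argsort()[:, ::-1] sorts each centroid's vocabulary weights in descending order, so the first columns index each cluster's heaviest terms. A sketch of the whole naming step, assuming km is the fitted KMeans and vectorizer the fitted TfidfVectorizer (stand-in names):

terms = vectorizer.get_feature_names_out()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
cluster_names_list = []
for cat in range(km.n_clusters):
    top_ten_words = [terms[i] for i in order_centroids[cat, :10]]
    cluster_names_list.append((cat, get_category_name(top_ten_words)))
# a dict lookup replaces the per-tuple list comprehensions in one pass
name_for = dict(cluster_names_list)
named_labels = [name_for[label] for label in km.labels_]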
...
@@ -187,6 +187,7 @@ def PdfBreaker(gz_url):
        filteredRunList = listRuns[beginIndex: len(listRuns) - 1]
        getTopics(filteredRunList, id, dt, gnum,
                  exType, firstRun, pubDate)
    # Get the job titles
    def getTopics(listRuns, counter, dt, gnum, exType, firstRun, pubDate):
        dictTD = [{"key": "value"}]
@@ -293,7 +294,7 @@ def PdfBreaker(gz_url):
        # print(json_object)
        return json_object
    # Classify the main topic
    def main_breaker(pubDate_, document=None, js=None):
        is_post_vacant = False
        is_examinations = False
@@ -330,13 +331,14 @@ def PdfBreaker(gz_url):
        gazetteno = re.sub("[^0-9]", "", gazetteno)
        pubDate = pubDate_
        print("date: ", pubDate)
        print('===================================================================')
        print("gazette no -", gazetteno)
        print('===================================================================')
        # check the output
        isScraped = Gazette.objects.all().filter(GazetteNo=gazetteno)
        isScrapedStr = str(isScraped)
        if isScrapedStr == '<QuerySet []>':
            # get job titles
            js = getTopics(post_vacant_run_list,
                           0, dictTD, gazetteno, 0, firstRun, pubDate)
            js2 = getTopics(examinations_run_list,
@@ -347,7 +349,7 @@ def PdfBreaker(gz_url):
        else:
            js = {""}
        return js
    # Start classifying main topics
    # Main function of PDFBreaker
    pdfFiles = []
    js = None
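One note on the scrape check above: comparing str(isScraped) to '<QuerySet []>' relies on the QuerySet's printed form. If refactoring is ever on the table, Django's exists() expresses the same test directly; a sketch:

# equivalent emptiness check without string comparison
if not Gazette.objects.filter(GazetteNo=gazetteno).exists():
    ...  # gazette not yet scraped; run getTopics as above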
...
@@ -36,7 +36,7 @@ const LoadingIndicator = (props) => {
    )
  );
};
// set default state variables
export default function Vacancy() {
  const [vac, setVac] = useState([]);
  const [filteredData, setFilteredData] = useState(vac);
@@ -50,7 +50,7 @@ export default function Vacancy() {
    fetchVacancy();
    fetchVacancy();
  }
  // fetch vacancy data
  function fetchVacancy() {
    trackPromise(
      fetch(process.env.REACT_APP_API + "vacancy/" + selectedClient)
@@ -67,7 +67,7 @@ export default function Vacancy() {
    fetchVacancy();
    return () => {};
  }, []);
  // refresh the vacancy clustering
  function refreshData() {
    trackPromise(
      fetch(process.env.REACT_APP_API + "refreshClustering_Vacancy/")
@@ -79,11 +79,11 @@ export default function Vacancy() {
      })
    );
  }
  // toggle the popup
  const togglePopup = () => {
    setshowPopup((showPopup) => !showPopup);
  };
  // set the category
  return (
    <>
      <div>
...