Commit d68b237f authored by IT18396164-Silva K.K.S's avatar IT18396164-Silva K.K.S

Merge branch 'v1/be/IT18396164/classification_and_clustering_ml' into 'development'

V1/be/it18396164/classification and clustering ml

See merge request !1
parents c71d63e4 1f8d77d9
......@@ -24,27 +24,7 @@ import json
class TopicCluster:
def cluster():
texts = [
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Registrar of Muslim Marriages -Gampaha",
"Registrar of Births Deaths and Marriages",
"Registrar of Muslim Marriages -Ratnapura",
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Teacher Services 2021 for sinhala,Tamil and English-Kaluthara District",
"Teacher Services 2021 for sinhala,Tamil and English-Galle District",
"Teacher Services for sinhala-Ratnapura District",
"Medical officer preliminary grade i",
"medical consultant",
"medical officer grade i",
"medical officer grade ii",
"Medical officer preliminary grade ii",
"Community Development Assistant",
"Data Processing Assistant -colombo",
"Community Development Assistant",
"Data Processing Assistant -ratnapura",
"Social Development Assistant",
]
#clustering with k-means
count_vectorizer = CountVectorizer()
# .fit_transfer TOKENIZES and COUNTS
......@@ -79,11 +59,6 @@ class TopicCluster:
X = count_vectorizer.fit_transform(texts)
# print(count_vectorizer.get_feature_names())
tfidf_vectorizer = TfidfVectorizer(
stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(texts)
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
tfidf_vectorizer = TfidfVectorizer(
stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(texts)
......@@ -98,7 +73,7 @@ class TopicCluster:
idf_df = pd.DataFrame(
X.toarray(), columns=idf_vectorizer.get_feature_names())
# idf_df
#updated clustering code
# use_idf=True is default, but I'll leave it in
idf_vectorizer = TfidfVectorizer(
stop_words='english', tokenizer=stemming_tokenizer, use_idf=True, norm='l2')
......@@ -114,12 +89,7 @@ class TopicCluster:
l2_df = pd.DataFrame(
X.toarray(), columns=l2_vectorizer.get_feature_names())
# l2_df
# Initialize a vectorizer
vectorizer = TfidfVectorizer(
use_idf=True, tokenizer=stemming_tokenizer, stop_words='english')
X = vectorizer.fit_transform(texts)
#removing unwanted file
# distortions = []
# K = range(1, 8)
# for k in K:
......@@ -133,7 +103,7 @@ class TopicCluster:
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()
#get number of clusters automatically
def get_k_value(y_list):
try:
coordinate_dict = {}
......@@ -182,7 +152,7 @@ class TopicCluster:
# transformed_everything[-n_clusters:, 1], marker='o')
# plt.show()
# check the similarity using Levenshtein distance
def get_category_name(top_words):
avgList = []
category = ''
......@@ -209,7 +179,7 @@ class TopicCluster:
print("\n")
return category
# use topic modeling to get the top words
cluster_names_list = []
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
......@@ -221,7 +191,7 @@ class TopicCluster:
print("Cluster {}: {}".format(cat, ' '.join(top_ten_words)))
print("\n")
print(cluster_names_list)
# matching the clustering output
newkmlist_ = km.labels_
for tuple in cluster_names_list:
newkmlist_ = [tuple[1] if i == tuple[0] else i for i in newkmlist_]
......
......@@ -187,6 +187,7 @@ def PdfBreaker(gz_url):
filteredRunList = listRuns[beginIndex: len(listRuns) - 1]
getTopics(filteredRunList, id, dt, gnum,
exType, firstRun, pubDate)
#Get the job titles
def getTopics(listRuns, counter, dt, gnum, exType, firstRun, pubDate):
dictTD = [{"key": "value"}]
......@@ -293,7 +294,7 @@ def PdfBreaker(gz_url):
# print(json_object)
return json_object
#Classify the main topic
def main_breaker(pubDate_, document=None, js=None):
is_post_vacant = False
is_examinations = False
......@@ -330,13 +331,14 @@ def PdfBreaker(gz_url):
gazetteno = re.sub("[^0-9]", "", gazetteno)
pubDate = pubDate_
print("date: ", pubDate)
print('===============================================================')
print('===================================================================')
print("gazette no -", gazetteno)
print('===============================================================')
print('===================================================================')
#check the output
isScraped = Gazette.objects.all().filter(GazetteNo=gazetteno)
isScrapedStr = str(isScraped)
if isScrapedStr == '<QuerySet []>':
# get job titles
js = getTopics(post_vacant_run_list,
0, dictTD, gazetteno, 0, firstRun, pubDate)
js2 = getTopics(examinations_run_list,
......@@ -347,7 +349,7 @@ def PdfBreaker(gz_url):
else:
js = {""}
return js
# Start classifying main topics
# Main function of PDFBreaker
pdfFiles = []
js = None
......
......@@ -36,7 +36,7 @@ const LoadingIndicator = (props) => {
)
);
};
//set default variable
export default function Vacancy() {
const [vac, setVac] = useState([]);
const [filteredData, setFilteredData] = useState(vac);
......@@ -50,7 +50,7 @@ export default function Vacancy() {
fetchVacancy();
fetchVacancy();
}
//fetch vacancy
function fetchVacancy() {
trackPromise(
fetch(process.env.REACT_APP_API + "vacancy/" + selectedClient)
......@@ -67,7 +67,7 @@ export default function Vacancy() {
fetchVacancy();
return () => {};
}, []);
//refresh clustering vacancy
function refreshData() {
trackPromise(
fetch(process.env.REACT_APP_API + "refreshClustering_Vacancy/")
......@@ -79,11 +79,11 @@ export default function Vacancy() {
})
);
}
//set the popup
const togglePopup = () => {
setshowPopup((showPopup) => !showPopup);
};
//set the category
return (
<>
<div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment