Clustering code edited

parent ee477dd0
...@@ -24,27 +24,7 @@ import json ...@@ -24,27 +24,7 @@ import json
class TopicCluster: class TopicCluster:
def cluster(): def cluster():
texts = [ #clustering with k-means
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Registrar of Muslim Marriages -Gampaha",
"Registrar of Births Deaths and Marriages",
"Registrar of Muslim Marriages -Ratnapura",
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Teacher Services 2021 for sinhala,Tamil and English-Kaluthara District",
"Teacher Services 2021 for sinhala,Tamil and English-Galle District",
"Teacher Services for sinhala-Ratnapura District",
"Medical officer preliminary grade i",
"medical consultant",
"medical officer grade i",
"medical officer grade ii",
"Medical officer preliminary grade ii",
"Community Development Assistant",
"Data Processing Assistant -colombo",
"Community Development Assistant",
"Data Processing Assistant -ratnapura",
"Social Development Assistant",
]
count_vectorizer = CountVectorizer() count_vectorizer = CountVectorizer()
# .fit_transform TOKENIZES and COUNTS # .fit_transform TOKENIZES and COUNTS
...@@ -109,12 +89,7 @@ class TopicCluster: ...@@ -109,12 +89,7 @@ class TopicCluster:
l2_df = pd.DataFrame( l2_df = pd.DataFrame(
X.toarray(), columns=l2_vectorizer.get_feature_names()) X.toarray(), columns=l2_vectorizer.get_feature_names())
# l2_df # l2_df
#removing unwanted file
# Initialize a vectorizer
vectorizer = TfidfVectorizer(
use_idf=True, tokenizer=stemming_tokenizer, stop_words='english')
X = vectorizer.fit_transform(texts)
# distortions = [] # distortions = []
# K = range(1, 8) # K = range(1, 8)
# for k in K: # for k in K:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment