Clustering code edited

parent ee477dd0
......@@ -24,27 +24,7 @@ import json
class TopicCluster:
def cluster():
texts = [
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Registrar of Muslim Marriages -Gampaha",
"Registrar of Births Deaths and Marriages",
"Registrar of Muslim Marriages -Ratnapura",
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Teacher Services 2021 for sinhala,Tamil and English-Kaluthara District",
"Teacher Services 2021 for sinhala,Tamil and English-Galle District",
"Teacher Services for sinhala-Ratnapura District",
"Medical officer preliminary grade i",
"medical consultant",
"medical officer grade i",
"medical officer grade ii",
"Medical officer preliminary grade ii",
"Community Development Assistant",
"Data Processing Assistant -colombo",
"Community Development Assistant",
"Data Processing Assistant -ratnapura",
"Social Development Assistant",
]
#clustering with k-means
count_vectorizer = CountVectorizer()
# .fit_transform TOKENIZES and COUNTS
......@@ -109,12 +89,7 @@ class TopicCluster:
l2_df = pd.DataFrame(
X.toarray(), columns=l2_vectorizer.get_feature_names())
# l2_df
# Initialize a vectorizer
vectorizer = TfidfVectorizer(
use_idf=True, tokenizer=stemming_tokenizer, stop_words='english')
X = vectorizer.fit_transform(texts)
#removing unwanted file
# distortions = []
# K = range(1, 8)
# for k in K:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment