Commit d68b237f authored by IT18396164-Silva K.K.S's avatar IT18396164-Silva K.K.S

Merge branch 'v1/be/IT18396164/classification_and_clustering_ml' into 'development'

V1/be/it18396164/classification and clustering ml

See merge request !1
parents c71d63e4 1f8d77d9
......@@ -24,27 +24,7 @@ import json
class TopicCluster:
def cluster():
texts = [
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Registrar of Muslim Marriages -Gampaha",
"Registrar of Births Deaths and Marriages",
"Registrar of Muslim Marriages -Ratnapura",
"Registrar of Births Deaths and MarriagesAdditional Marriages Kandyan",
"Teacher Services 2021 for sinhala,Tamil and English-Kaluthara District",
"Teacher Services 2021 for sinhala,Tamil and English-Galle District",
"Teacher Services for sinhala-Ratnapura District",
"Medical officer preliminary grade i",
"medical consultant",
"medical officer grade i",
"medical officer grade ii",
"Medical officer preliminary grade ii",
"Community Development Assistant",
"Data Processing Assistant -colombo",
"Community Development Assistant",
"Data Processing Assistant -ratnapura",
"Social Development Assistant",
]
#clustering with k-means
count_vectorizer = CountVectorizer()
# .fit_transfer TOKENIZES and COUNTS
......@@ -79,11 +59,6 @@ class TopicCluster:
X = count_vectorizer.fit_transform(texts)
# print(count_vectorizer.get_feature_names())
tfidf_vectorizer = TfidfVectorizer(
stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(texts)
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
tfidf_vectorizer = TfidfVectorizer(
stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(texts)
......@@ -98,7 +73,7 @@ class TopicCluster:
idf_df = pd.DataFrame(
X.toarray(), columns=idf_vectorizer.get_feature_names())
# idf_df
#updated clustering code
# use_idf=True is default, but I'll leave it in
idf_vectorizer = TfidfVectorizer(
stop_words='english', tokenizer=stemming_tokenizer, use_idf=True, norm='l2')
......@@ -114,12 +89,7 @@ class TopicCluster:
l2_df = pd.DataFrame(
X.toarray(), columns=l2_vectorizer.get_feature_names())
# l2_df
# Initialize a vectorizer
vectorizer = TfidfVectorizer(
use_idf=True, tokenizer=stemming_tokenizer, stop_words='english')
X = vectorizer.fit_transform(texts)
#removing unwanted file
# distortions = []
# K = range(1, 8)
# for k in K:
......@@ -133,7 +103,7 @@ class TopicCluster:
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()
#get number of clusters automatically
def get_k_value(y_list):
try:
coordinate_dict = {}
......@@ -182,7 +152,7 @@ class TopicCluster:
# transformed_everything[-n_clusters:, 1], marker='o')
# plt.show()
# check the similarity using Levenshtein distance
def get_category_name(top_words):
avgList = []
category = ''
......@@ -209,7 +179,7 @@ class TopicCluster:
print("\n")
return category
# use topic modeling to get the top words
cluster_names_list = []
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
......@@ -221,7 +191,7 @@ class TopicCluster:
print("Cluster {}: {}".format(cat, ' '.join(top_ten_words)))
print("\n")
print(cluster_names_list)
# matching the clustering output
newkmlist_ = km.labels_
for tuple in cluster_names_list:
newkmlist_ = [tuple[1] if i == tuple[0] else i for i in newkmlist_]
......
......@@ -187,6 +187,7 @@ def PdfBreaker(gz_url):
filteredRunList = listRuns[beginIndex: len(listRuns) - 1]
getTopics(filteredRunList, id, dt, gnum,
exType, firstRun, pubDate)
#Get the job titles
def getTopics(listRuns, counter, dt, gnum, exType, firstRun, pubDate):
dictTD = [{"key": "value"}]
......@@ -293,7 +294,7 @@ def PdfBreaker(gz_url):
# print(json_object)
return json_object
#Classify the main topic
def main_breaker(pubDate_, document=None, js=None):
is_post_vacant = False
is_examinations = False
......@@ -330,13 +331,14 @@ def PdfBreaker(gz_url):
gazetteno = re.sub("[^0-9]", "", gazetteno)
pubDate = pubDate_
print("date: ", pubDate)
print('===============================================================')
print('===================================================================')
print("gazette no -", gazetteno)
print('===============================================================')
print('===================================================================')
#check the output
isScraped = Gazette.objects.all().filter(GazetteNo=gazetteno)
isScrapedStr = str(isScraped)
if isScrapedStr == '<QuerySet []>':
# get job titles
js = getTopics(post_vacant_run_list,
0, dictTD, gazetteno, 0, firstRun, pubDate)
js2 = getTopics(examinations_run_list,
......@@ -347,7 +349,7 @@ def PdfBreaker(gz_url):
else:
js = {""}
return js
# Start classifying main topics
# Main function of PDFBreaker
pdfFiles = []
js = None
......
......@@ -36,7 +36,7 @@ const LoadingIndicator = (props) => {
)
);
};
//set default variable
export default function Vacancy() {
const [vac, setVac] = useState([]);
const [filteredData, setFilteredData] = useState(vac);
......@@ -50,7 +50,7 @@ export default function Vacancy() {
fetchVacancy();
fetchVacancy();
}
//fetch vacancy
function fetchVacancy() {
trackPromise(
fetch(process.env.REACT_APP_API + "vacancy/" + selectedClient)
......@@ -67,7 +67,7 @@ export default function Vacancy() {
fetchVacancy();
return () => {};
}, []);
//refresh clustering vacancy
function refreshData() {
trackPromise(
fetch(process.env.REACT_APP_API + "refreshClustering_Vacancy/")
......@@ -79,11 +79,11 @@ export default function Vacancy() {
})
);
}
//set the popup
const togglePopup = () => {
setshowPopup((showPopup) => !showPopup);
};
//set the category
return (
<>
<div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment