BACKEND : nlp model added to ner_models folder, new method...

BACKEND: NLP model added to ner_models folder, new method class_name_detection added to class_model_detection_service, updated save_attributes_methods

BACKEND : nlp model added to ner_models folder, new method...
BACKEND: NLP model added to ner_models folder, new method class_name_detection added to class_model_detection_service, updated save_attributes_methods
b168f2ad · Weerasinghe D.N.H · faa4e97d · b168f2ad · b168f2ad · b168f2ad
Commit b168f2ad authored Sep 29, 2022 by Weerasinghe D.N.H
17 changed files
--- a/backend/database.db
+++ b/backend/database.db
--- a/backend/models/method_model.py
+++ b/backend/models/method_model.py
@@ -3,9 +3,9 @@ from config.database import db

 class Method(db.Model):
    id = db.Column(db.Integer, primary_key=True)
-    return_type = db.Column(db.String(50), nullable=False)
+    return_type = db.Column(db.String(50))
    name = db.Column(db.String(50), nullable=False)
-    access_spec = db.Column(db.String(50), nullable=False)
+    access_spec = db.Column(db.String(50))
    class_id = db.Column(db.Integer)

    def __repr__(self) -> str:

--- a/backend/ner_models/model-best/config.cfg
+++ b/backend/ner_models/model-best/config.cfg
+[paths]
+train = "./training_data.spacy"
+dev = "./training_data.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = null
+seed = 0
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec","ner"]
+batch_size = 1000
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+[components]
+
+[components.ner]
+factory = "ner"
+incorrect_spans_key = null
+moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
+update_with_oracle_cut_size = 100
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+upstream = "*"
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+ents_f = 1.0
+ents_p = 0.0
+ents_r = 0.0
+ents_per_type = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
\ No newline at end of file
--- a/backend/ner_models/model-best/meta.json
+++ b/backend/ner_models/model-best/meta.json
+{
+  "lang":"en",
+  "name":"pipeline",
+  "version":"0.0.0",
+  "spacy_version":">=3.4.1,<3.5.0",
+  "description":"",
+  "author":"",
+  "email":"",
+  "url":"",
+  "license":"",
+  "spacy_git_version":"Unknown",
+  "vectors":{
+    "width":0,
+    "vectors":0,
+    "keys":0,
+    "name":null,
+    "mode":"default"
+  },
+  "labels":{
+    "tok2vec":[
+
+    ],
+    "ner":[
+      "ACCESS_SP",
+      "ATTRIBUTE_NAME",
+      "DATA_TYPE",
+      "METHOD_NAME",
+      "MULTIPLICITY",
+      "PARAMETERS"
+    ]
+  },
+  "pipeline":[
+    "tok2vec",
+    "ner"
+  ],
+  "components":[
+    "tok2vec",
+    "ner"
+  ],
+  "disabled":[
+
+  ],
+  "performance":{
+    "ents_f":0.9938850387,
+    "ents_p":0.9918633035,
+    "ents_r":0.9959150327,
+    "ents_per_type":{
+      "ACCESS_SP":{
+        "p":0.995157385,
+        "r":0.9927536232,
+        "f":0.9939540508
+      },
+      "ATTRIBUTE_NAME":{
+        "p":0.9959183673,
+        "r":0.9959183673,
+        "f":0.9959183673
+      },
+      "DATA_TYPE":{
+        "p":0.9856321839,
+        "r":1.0,
+        "f":0.99276411
+      },
+      "METHOD_NAME":{
+        "p":0.993902439,
+        "r":0.993902439,
+        "f":0.993902439
+      },
+      "MULTIPLICITY":{
+        "p":0.9756097561,
+        "r":1.0,
+        "f":0.987654321
+      },
+      "PARAMETERS":{
+        "p":1.0,
+        "r":1.0,
+        "f":1.0
+      }
+    },
+    "tok2vec_loss":78.7692795898,
+    "ner_loss":1014.2115147574
+  }
+}
\ No newline at end of file
--- a/backend/ner_models/model-best/ner/cfg
+++ b/backend/ner_models/model-best/ner/cfg
+{
+  "moves":null,
+  "update_with_oracle_cut_size":100,
+  "multitasks":[
+
+  ],
+  "min_action_freq":1,
+  "learn_tokens":false,
+  "beam_width":1,
+  "beam_density":0.0,
+  "beam_update_prob":0.0,
+  "incorrect_spans_key":null
+}
\ No newline at end of file
--- a/backend/ner_models/model-best/ner/model
+++ b/backend/ner_models/model-best/ner/model
--- a/backend/ner_models/model-best/ner/moves
+++ b/backend/ner_models/model-best/ner/moves
+moves{"0":{},"1":{"ACCESS_SP":414,"DATA_TYPE":346,"ATTRIBUTE_NAME":256,"METHOD_NAME":166,"MULTIPLICITY":102,"PARAMETERS":26},"2":{"ACCESS_SP":414,"DATA_TYPE":346,"ATTRIBUTE_NAME":256,"METHOD_NAME":166,"MULTIPLICITY":102,"PARAMETERS":26},"3":{"ACCESS_SP":414,"DATA_TYPE":346,"ATTRIBUTE_NAME":256,"METHOD_NAME":166,"MULTIPLICITY":102,"PARAMETERS":26},"4":{"ACCESS_SP":414,"DATA_TYPE":346,"ATTRIBUTE_NAME":256,"METHOD_NAME":166,"MULTIPLICITY":102,"PARAMETERS":26,"":1},"5":{"":1}}cfgneg_key
\ No newline at end of file
--- a/backend/ner_models/model-best/tok2vec/cfg
+++ b/backend/ner_models/model-best/tok2vec/cfg
+{
+
+}
\ No newline at end of file
--- a/backend/ner_models/model-best/tok2vec/model
+++ b/backend/ner_models/model-best/tok2vec/model
--- a/backend/ner_models/model-best/tokenizer
+++ b/backend/ner_models/model-best/tokenizer
--- a/backend/ner_models/model-best/vocab/key2row
+++ b/backend/ner_models/model-best/vocab/key2row
+€
\ No newline at end of file
--- a/backend/ner_models/model-best/vocab/lookups.bin
+++ b/backend/ner_models/model-best/vocab/lookups.bin
+€
\ No newline at end of file
--- a/backend/ner_models/model-best/vocab/strings.json
+++ b/backend/ner_models/model-best/vocab/strings.json
--- a/backend/ner_models/model-best/vocab/vectors
+++ b/backend/ner_models/model-best/vocab/vectors
--- a/backend/ner_models/model-best/vocab/vectors.cfg
+++ b/backend/ner_models/model-best/vocab/vectors.cfg
+{
+  "mode":"default"
+}
\ No newline at end of file
--- a/backend/services/class_model_detection_service.py
+++ b/backend/services/class_model_detection_service.py
@@ -16,7 +16,6 @@ import spacy

 from config.database import db
 from models.component_model import Component
-from models.interface_model import Interface
 from models.method_model import Method

 ts.pytesseract.tesseract_cmd = r'C:\Users\DELL\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
@@ -45,7 +44,6 @@ def component_separation(filename, class_comp_id):
            elif category_index[class_id[index]]['name'] == 'interface':
                _image = crop_image_(image_nparray, boxes, index)
                _image = cv2.resize(_image, None, fx=2, fy=2)
-                class_details_detection(_image, class_comp_id)


 def class_object_detection(model_path, label_path, image_nparray):
@@ -65,7 +63,7 @@ def class_object_detection(model_path, label_path, image_nparray):

    detections['detection_classes'] = detections['detection_classes'].astype(np.int64)

-    accurate_indexes = [k for k, v in enumerate(detections['detection_scores']) if (v > 0.6)]
+    accurate_indexes = [k for k, v in enumerate(detections['detection_scores']) if (v > 0.7)]
    num_entities = len(accurate_indexes)

    class_id = operator.itemgetter(*accurate_indexes)(detections['detection_classes'])
@@ -74,45 +72,30 @@ def class_object_detection(model_path, label_path, image_nparray):


 def class_details_detection(_image, class_comp_id):
-    class_names = []
-    attributes = []
-    methods = []
+    attributes_methods = []

    mdl2_path = app.CLASS_COMP_SAVED_MODEL_PATH
    lbl2_path = app.CLASS_COMP_SAVED_LABEL_PATH
    boxes_class, num_entities, accurate_indexes, num_entities, category_index, class_id = class_object_detection(
        mdl2_path, lbl2_path, _image)

+    comp = class_name_detection(class_comp_id, _image, boxes_class, accurate_indexes)
+
    if num_entities > 1:
        for j in range(0, len(accurate_indexes)):
            if category_index[class_id[j]]['name'] == 'class_attributes':
-                attr_removed = crop_and_hide(_image, boxes_class, j)
-                class_names.append(attr_removed)
                class_attributes = crop_image_(_image, boxes_class, j)
                text = text_extraction(class_attributes)
-                attribute = save_attributes_methods(text, 'attribute')
-                attributes.append(attribute)
+                attributes = save_attributes_methods(text, 'attribute')
+                alter_attributes_methods(attributes, comp.id)

            elif category_index[class_id[j]]['name'] == 'class_methods':
                class_methods = crop_image_(_image, boxes_class, j)
                text = text_extraction(class_methods)
                print(text)
-                method_removed = crop_and_hide(class_names[0], boxes_class, j)
-                class_name = text_extraction(method_removed)
-                method = save_attributes_methods(text, 'method')
-                methods.append(method)
-                if ''.join(class_name) != '':
-                    if "interface" in ''.join(class_name):
-                        name = ''.join(class_name).replace("<<interface>>", "")
-                        comp = Component(class_answer=class_comp_id, name=name, type="interface")
-                    else:
-                        name = ''.join(class_name)
-                        comp = Component(class_answer=class_comp_id, name=name, type="class")
-
-                    db.session.add(comp)
-                    db.session.commit()
-                    alter_attributes_methods(methods, comp.id)
-                    class_names.clear()
+                methods = save_attributes_methods(text, 'method')
+                alter_attributes_methods(methods, comp.id)
+                print(text)


 def crop_image_(image, boxes, index):
@@ -141,47 +124,72 @@ def text_extraction(image):


 def save_attributes_methods(text, typ):
+    global saved_data
    nlp = spacy.load('en_core_web_sm')
    for element in text:
-        access = covert_to_access_specifier(element)
-        removable = str.maketrans('', '', '()')
-        nlp_output = list(filter(None, nlp(element.translate(removable))))
-        for token in nlp_output:
-            if token.text == ':':
-                previous_index = nlp_output.index(token) - 1
-                next_index = nlp_output.index(token) + 1
-                if typ == 'attribute':
-                    attr = Attribute(data_type=nlp_output[next_index], name=nlp_output[previous_index],
-                                     access_spec=access)
-                    db.session.add(attr)
-                    db.session.commit()
-                    return attr
-
-                else:
-                    method = Method(return_type=nlp_output[next_index], name=nlp_output[previous_index],
-                                    access_spec=access)
-                    db.session.add(method)
-                    db.session.commit()
-                    return method
+        print(element)
+        # removable = str.maketrans('', '', '()')
+        nlp_ner = spacy.load('ner_models/model-best')
+        nlp_output = nlp_ner(element)
+        attr = Attribute()
+        method = Method()
+
+        for token in nlp_output.ents:
+
+            if typ == 'attribute':
+                if token.label_ == 'ATTRIBUTE_NAME':
+                    attr.name = token.text
+
+                elif token.label_ == 'ACCESS_SP':
+                    attr.access_spec = covert_to_access_specifier(token.text)
+
+                elif token.label_ == 'DATA_TYPE':
+                    attr.data_type = token.text
+
+            elif typ == 'method':
+                if token.label_ == 'METHOD_NAME':
+                    method.name = token.text
+
+                elif token.label_ == 'ACCESS_SP':
+                    method.access_spec = covert_to_access_specifier(token.text)
+
+                elif token.label_ == 'DATA_TYPE':
+                    method.return_type = token.text
+
+        if typ == 'attribute':
+            print(attr)
+            db.session.add(attr)
+            db.session.commit()
+            saved_data.append(attr)
+
+        else:
+            print(method)
+            db.session.add(method)
+            db.session.commit()
+            saved_data.append(method)
+
+    return saved_data


 def alter_attributes_methods(element_list, class_id):
    for element in element_list:
+        print(class_id)
+        print(element_list)
        element.class_id = class_id
        db.session.commit()


 def covert_to_access_specifier(access):
-    if access.startswith('-'):
+    if access == "-":
        return "Private"

-    elif access.startswith('#'):
+    elif access == "#":
        return "Protected"

-    if access.startswith('+'):
+    if access == "+":
        return "Public"

-    elif access.startswith('~'):
+    elif access == "~":
        return "Package"

    else:
@@ -190,12 +198,28 @@ def covert_to_access_specifier(access):

 def crop_and_hide(image, boxes, index):
    height, width, c = image.shape
-    # crop box format: xmin, ymin, xmax, ymax
-    ymin = boxes[index][0] * height
-    xmin = boxes[index][1] * width
-    ymax = boxes[index][2] * height
-    xmax = boxes[index][3] * width
-
-    image[int(ymin):int(ymax), int(xmin):int(xmax)] = 255
+    for i in range(0, len(index)):
+        ymin = boxes[i][0] * height
+        xmin = boxes[i][1] * width
+        ymax = boxes[i][2] * height
+        xmax = boxes[i][3] * width

+        cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (255, 255, 255), -1)
    return image
+
+
+def class_name_detection(class_comp_id, image, boxes, index):
+    image = crop_and_hide(image, boxes, index)
+
+    class_name = text_extraction(image)
+    if ''.join(class_name) != '':
+        if "interface" in ''.join(class_name):
+            name = ''.join(class_name).replace("<<interface>>", "")
+            comp = Component(class_answer=class_comp_id, name=name, type="interface")
+        else:
+            name = ''.join(class_name)
+            comp = Component(class_answer=class_comp_id, name=name, type="class")
+
+        db.session.add(comp)
+        db.session.commit()
+        return comp
--- a/backend/submissions/class/research_classes-Page-1.jpg
+++ b/backend/submissions/class/research_classes-Page-1.jpg