# 2022-303
Research Project on 'Real-Time Translation of Sinhala Sign Language to Sinhala Text'
**Main Objective**
Bridge the communication barrier between the hearing-impaired community and the general public.
**Main Research questions**
1. How to preprocess the video and determine the sign type in real time
2. How to classify static and dynamic gestures in real time
3. How to generate a meaningful Sinhala text based on the classified gestures
4. How to handle uninterpreted gestures
**Individual Research Questions**
1. Video Preprocessing and Sign Type Determination
How to identify key moments of a sign in a real-time video feed?
How to identify hands in a given image frame?
How to detect whether a sign is static or dynamic?
2. Static Gesture Classifier
How to create an efficient dataset?
How to train the model for accurate and efficient communication?
How to ensure accurate sign prediction and classification?
How to handle uninterpreted gestures?
3. Dynamic Gesture Classifier
How to create an efficient dataset?
How to train the model for accurate and efficient communication?
How to ensure accurate video classification?
How to handle uninterpreted gestures?
4. Generate Sinhala Text
How to generate Sinhala text?
How to form a sentence?
How to check grammar?
What is the best way to present the correct output?
**Individual Objectives**
1. Video Preprocessing and Sign Type Determination: identify signs in a real-time video feed and detect whether each sign is static or dynamic (a rough sketch of this check follows the list).
2. Static Gesture Classifier: classify static gestures in real time to provide an accurate interpretation and an appropriate prediction for the recognized sign.
3. Dynamic Gesture Classifier: identify dynamic gestures in real time with optimal accuracy.
4. Generate Sinhala Text: display the correct Sinhala word or sentence for the recognized signs.
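The static/dynamic check in objective 1 could, as a rough sketch (not the project's actual implementation), be based on inter-frame motion measured with OpenCV; the threshold below is an assumed value that would need tuning on real Sinhala sign recordings.

```python
import cv2
import numpy as np

MOTION_THRESHOLD = 8.0  # assumed mean absolute-difference cutoff, not from the project


def read_gray_frames(path, max_frames=60):
    """Read up to max_frames grayscale frames from a video file or camera index."""
    cap = cv2.VideoCapture(path)
    frames = []
    while len(frames) < max_frames:
        ok, frame = cap.read()
        if not ok:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    cap.release()
    return frames


def sign_type(frames):
    """Label a short clip 'static' or 'dynamic' from average inter-frame motion."""
    if len(frames) < 2:
        return 'static'
    diffs = [np.mean(cv2.absdiff(prev, curr)) for prev, curr in zip(frames, frames[1:])]
    return 'dynamic' if np.mean(diffs) > MOTION_THRESHOLD else 'static'
```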
**Other necessary information**
Technology Stack
* Python
* TensorFlow
* Keras
* OpenCV
* Google Colaboratory
* Flutter
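For the static gesture classifier objective, a minimal Keras sketch is shown below. The input size, layer sizes, and class count are illustrative assumptions, not the project's actual architecture.

```python
import tensorflow as tf
from tensorflow.keras import layers, models

NUM_CLASSES = 7      # e.g. the seven labels listed in obj.names (assumption)
IMG_SIZE = (64, 64)  # assumed size of cropped hand images

model = models.Sequential([
    layers.Input(shape=(*IMG_SIZE, 3)),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(NUM_CLASSES, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# model.fit(train_images, train_labels, epochs=10)  # hypothetical training data
```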
# YOLO-v3-Object-Detection
YOLO - You Only Look Once
This repository contains the code for YOLO v3 object detection and is capable of fast detection. Input can be given as images, videos, or a webcam feed.
View a demo of the project at: https://www.youtube.com/watch?v=iT1yJTk77CQ
from flask import Flask, jsonify, request
from yolo_detection_images import detectObjects

app = Flask(__name__)


@app.route('/myapp/detectObjects')
def detect():
    # Image file name is passed as a query parameter, e.g. ?image=sample.jpg
    img = request.args['image']
    img_path = 'images/' + img
    results = detectObjects(img_path)
    return jsonify(results)


app.run()
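A hedged client-side example for the endpoint above, assuming the Flask app is running locally on Flask's default port (5000) and that the image name passed (here a placeholder) exists under the server's images/ folder:

```python
import requests

# 'sample.jpg' is a placeholder name; the server resolves it under its images/ folder
resp = requests.get('http://127.0.0.1:5000/myapp/detectObjects',
                    params={'image': 'sample.jpg'})
print(resp.json())
```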
[net]
# Testing
# batch=1
# subdivisions=1
# Training
batch=64
subdivisions=16
width=608
height=608
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
# Downsample
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
######################
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 61
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 36
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
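# Note: the three [yolo] sections above are the detection heads at three scales
# (masks 6,7,8 / 3,4,5 / 0,1,2 select the large, medium and small anchors respectively),
# and the preceding 1x1 convolutions use filters=255 = 3 anchors x (80 classes + 5).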
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
Ayubowan
Good
Bad
House
Love
One
You
import numpy as np
import cv2
def detectObjects(img_path):
    confidenceThreshold = 0.5
    NMSThreshold = 0.3

    modelConfiguration = 'cfg/yolov4-obj.cfg'
    modelWeights = 'yolov4-obj_final.weights'

    labelsPath = 'obj.names'
    labels = open(labelsPath).read().strip().split('\n')

    np.random.seed(10)
    COLORS = np.random.randint(0, 255, size=(len(labels), 3), dtype="uint8")

    net = cv2.dnn.readNetFromDarknet(modelConfiguration, modelWeights)

    image = cv2.imread(img_path)
    (H, W) = image.shape[:2]

    # Determine output layer names
    layerName = net.getLayerNames()
    layerName = [layerName[i - 1] for i in net.getUnconnectedOutLayers()]

    blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    layersOutputs = net.forward(layerName)

    boxes = []
    confidences = []
    classIDs = []

    for output in layersOutputs:
        for detection in output:
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]
            if confidence > confidenceThreshold:
                # Scale the box back to the original image dimensions
                box = detection[0:4] * np.array([W, H, W, H])
                (centerX, centerY, width, height) = box.astype('int')
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                classIDs.append(classID)

    # Apply Non-Maxima Suppression
    detectionNMS = cv2.dnn.NMSBoxes(boxes, confidences, confidenceThreshold, NMSThreshold)

    outputs = {}
    if len(detectionNMS) > 0:
        outputs['detections'] = {}
        outputs['detections']['labels'] = []
        for i in detectionNMS.flatten():
            detection = {}
            detection['Label'] = labels[classIDs[i]]
            detection['Confidence'] = confidences[i]
            detection['X'] = boxes[i][0]
            detection['Y'] = boxes[i][1]
            detection['Width'] = boxes[i][2]
            detection['Height'] = boxes[i][3]
            outputs['detections']['labels'].append(detection)
    else:
        outputs['detections'] = 'No object detected'

    return outputs
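A small usage sketch for the function above, assuming the cfg, weights, and obj.names files it references are present; the image path is a placeholder:

```python
from yolo_detection_images import detectObjects

# 'images/sample.jpg' is a placeholder path
results = detectObjects('images/sample.jpg')
print(results)
```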
import numpy as np
import cv2
confidenceThreshold = 0.5
NMSThreshold = 0.3
modelConfiguration = 'cfg/yolov3.cfg'
modelWeights = 'yolov3.weights'
labelsPath = 'coco.names'
labels = open(labelsPath).read().strip().split('\n')
np.random.seed(10)
COLORS = np.random.randint(0, 255, size=(len(labels), 3), dtype="uint8")
net = cv2.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
outputLayer = net.getLayerNames()
# getUnconnectedOutLayers() returns a flat array in recent OpenCV versions
outputLayer = [outputLayer[i - 1] for i in net.getUnconnectedOutLayers()]

video = cv2.VideoCapture('chase.mp4')
writer = None
(W, H) = (None, None)

try:
    prop = cv2.CAP_PROP_FRAME_COUNT
    total = int(video.get(prop))
    print("[INFO] {} total frames in video".format(total))
except:
    print("Could not determine no. of frames in video")

count = 0
while True:
    (ret, frame) = video.read()
    if not ret:
        break

    if W is None or H is None:
        (H, W) = frame.shape[:2]

    blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    layersOutputs = net.forward(outputLayer)

    boxes = []
    confidences = []
    classIDs = []

    for output in layersOutputs:
        for detection in output:
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]
            if confidence > confidenceThreshold:
                # Scale the box back to the original frame dimensions
                box = detection[0:4] * np.array([W, H, W, H])
                (centerX, centerY, width, height) = box.astype('int')
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                classIDs.append(classID)

    # Apply Non-Maxima Suppression
    detectionNMS = cv2.dnn.NMSBoxes(boxes, confidences, confidenceThreshold, NMSThreshold)

    if len(detectionNMS) > 0:
        for i in detectionNMS.flatten():
            (x, y) = (boxes[i][0], boxes[i][1])
            (w, h) = (boxes[i][2], boxes[i][3])

            color = [int(c) for c in COLORS[classIDs[i]]]
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            text = '{}: {:.4f}'.format(labels[classIDs[i]], confidences[i])
            cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    if writer is None:
        fourcc = cv2.VideoWriter_fourcc(*'MJPG')
        writer = cv2.VideoWriter('chase_output.avi', fourcc, 30, (frame.shape[1], frame.shape[0]), True)

    writer.write(frame)
    print("Writing frame", count + 1)
    count = count + 1

if writer is not None:
    writer.release()
video.release()
import numpy as np
import cv2
confidenceThreshold = 0.5
NMSThreshold = 0.3
modelConfiguration = 'cfg/yolov3.cfg'
modelWeights = 'yolov3.weights'
labelsPath = 'coco.names'
labels = open(labelsPath).read().strip().split('\n')
np.random.seed(10)
COLORS = np.random.randint(0, 255, size=(len(labels), 3), dtype="uint8")
net = cv2.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
outputLayer = net.getLayerNames()
# getUnconnectedOutLayers() returns a flat array in recent OpenCV versions
outputLayer = [outputLayer[i - 1] for i in net.getUnconnectedOutLayers()]

video_capture = cv2.VideoCapture(0)
(W, H) = (None, None)

while True:
    ret, frame = video_capture.read()
    frame = cv2.flip(frame, 1)

    if W is None or H is None:
        (H, W) = frame.shape[:2]

    blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    layersOutputs = net.forward(outputLayer)

    boxes = []
    confidences = []
    classIDs = []

    for output in layersOutputs:
        for detection in output:
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]
            if confidence > confidenceThreshold:
                # Scale the box back to the original frame dimensions
                box = detection[0:4] * np.array([W, H, W, H])
                (centerX, centerY, width, height) = box.astype('int')
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                classIDs.append(classID)

    # Apply Non-Maxima Suppression
    detectionNMS = cv2.dnn.NMSBoxes(boxes, confidences, confidenceThreshold, NMSThreshold)

    if len(detectionNMS) > 0:
        for i in detectionNMS.flatten():
            (x, y) = (boxes[i][0], boxes[i][1])
            (w, h) = (boxes[i][2], boxes[i][3])

            color = [int(c) for c in COLORS[classIDs[i]]]
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            text = '{}: {:.4f}'.format(labels[classIDs[i]], confidences[i])
            cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    cv2.imshow('Output', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# When video capture is over, release the capture and destroy all windows
video_capture.release()
cv2.destroyAllWindows()