python – Video Lagging while Object Detection on C++

There is an object detection pre-trained model ie Yolov3/v4-tiny, when the algorithm is implemented in python, looked good, there is no lag while processing the video when displaying it over “Imshow”.

import cv2
import numpy as np
import time
#net = cv2.dnn.readNet("yolov3.weights","yolov3.cfg") # Original yolov3
net = cv2.dnn.readNet("yolov4-tiny.weights","yolov4-tiny.cfg") #Tiny Yolo
classes = []
with open("coco.names","r") as f:
    classes = [line.strip() for line in f.readlines()]

# print(classes)
layer_names = net.getLayerNames()
# print(layer_names)

outputlayers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]
print(outputlayers)
colors= np.random.uniform(0,255,size=(len(classes),3))
# print(colors)
cap=cv2.VideoCapture(0) #0 for 1st webcam
cap.set(cv2.CAP_PROP_FPS, 30)
cap.set(cv2.CAP_PROP_BUFFERSIZE, 2)
font = cv2.FONT_HERSHEY_PLAIN

frame_id = 0

while True:
    starting_time= time.time()
    _,frame= cap.read() # 
    frame_id+=1
    
    height,width,channels = frame.shape
    #detecting objects

    blob = cv2.dnn.blobFromImage(frame,1/127.0 ,(512,512),(0,0,0),True,crop=False) #reduce 416 to 320
    net.setInput(blob)
    outs = net.forward(outputlayers)
    #print(outs[1])

    #Showing info on screen/ get confidence score of algorithm in detecting an object in blob
    class_ids=[]
    confidences=[]
    boxes=[]
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.4:
                #object detected
                center_x= int(detection[0]*width)
                center_y= int(detection[1]*height)
                w = int(detection[2]*width)
                h = int(detection[3]*height)

                #cv2.circle(img,(center_x,center_y),10,(0,255,0),2)
                #rectangle co-ordinaters
                x=int(center_x - w/2)
                y=int(center_y - h/2)
                #cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)

                boxes.append([x,y,w,h]) #put all rectangle areas
                confidences.append(float(confidence)) #how confidence was that object detected and show that percentage
                class_ids.append(class_id) #name of the object that was detected

    indexes = cv2.dnn.NMSBoxes(boxes,confidences,0.4,0.6)

    for i in range(len(boxes)):
        if i in indexes:
            x,y,w,h = boxes[i]
            label = str(classes[class_ids[i]])
            confidence= confidences[i]
            color = colors[class_ids[i]]
            cv2.rectangle(frame,(x,y),(x+w,y+h),color,2)
            cv2.putText(frame,label+"("+str(round(confidence*100))+"%)",(x,y-1),font,1,color,2)
            

    elapsed_time = time.time() - starting_time
    print(elapsed_time)
    fps=frame_id/elapsed_time
    cv2.putText(frame,"FPS:"+str(round(fps,2)),(10,50),font,2,(0,0,0),1)
    # cv2.imshow("Image",frame)
    key = cv2.waitKey(1) #wait 1ms the loop will start again and we will process the next frame
    
    if key == 27: #esc key stops the process
        break
    
cap.release()    
cv2.destroyAllWindows()

but when I implemented the same in C++ the execution time increased and there is a huge lag on the video which is not real-time.

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <chrono>
#include <time.h>
using namespace std;
using namespace cv;
using namespace dnn;
using namespace chrono;
int main()
{
    vector<string> classes;
    string classesFile = "F:/coco.names";
    Net net = readNet("F:/yolov4-tiny.weights","F:/yolov4-tiny.cfg");
    ifstream ifs(classesFile.c_str());
    string line;
    while(getline(ifs,line)) classes.push_back(line);

    vector<string> layer_names = net.getLayerNames();
    vector<string> outputlayers;
    vector<int> unconnectedlayer = net.getUnconnectedOutLayers();
    for(int i=0;i<unconnectedlayer.size();i++)
        outputlayers.push_back(layer_names[unconnectedlayer[i]-1]);
    //check till above
    VideoCapture cap(0);
    cap.set(CAP_PROP_FPS,30);
    cap.set(CAP_PROP_BUFFERSIZE,2);
    Mat frame, blob;
    time_t start, end;
    while(1){
        time(&start);
        cap.read(frame);
        blobFromImage(frame, blob, 1/255.0,Size(512,512),Scalar(0,0,0),true,false);
        net.setInput(blob);
        vector<Mat> outs;
        net.forward(outs, outputlayers);
        int framewidth=frame.cols;
        int frameheight=frame.rows;
        vector<int> class_ids;
        vector<float> confidences;
        vector<Rect> boxes;

        for (size_t i = 0; i < outs.size(); ++i)
        {
        float* data = (float*)outs[i].data;
        for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
        {
            Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
            Point classIdPoint;
            double confidence;
            minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
                if (confidence > 0.4)
                {
                    int centerX = (int)(data[0] * framewidth);
                    int centerY = (int)(data[1] * frameheight);
                    int width = (int)(data[2] * framewidth);
                    int height = (int)(data[3] * frameheight);
                    int left = centerX - width / 2;
                    int top = centerY - height / 2;
                    class_ids.push_back(classIdPoint.x);
                    confidences.push_back((float)confidence);
                    boxes.push_back(Rect(left,top,width,height));
                }
        }
       }
       vector<int> indices;
       NMSBoxes(boxes, confidences, 0.4F, 0.6F, indices);
           for (size_t i = 0; i < indices.size(); ++i)
           {
               int idx = indices[i];
               Rect box = boxes[idx];
//               float confidence=confidences[idx];
               rectangle(frame,box,Scalar(255,0,0),2);
               putText(frame,classes[class_ids[i]],Point(box.x,box.y-1),FONT_HERSHEY_PLAIN,1,Scalar(255,0,0),2);
           }
        time(&end);
        double timetaken = (double)end - (double)start ;
        cout<<fixed<<timetaken<<endl;
        imshow("Image",frame);
        int c = waitKey(27);
        if((char)c == 'c')
                break;
}
    cap.release();
    destroyAllWindows();
}

So, then I moved ahead and, On further debugging, I found out that the python application is using Gpu-3d and C++ application is using gpu-videoprocessing. Now I’m clueless about how these both things are working differently. when the code is almost same.

Leave a Comment