python – How to extract a sub-video from a video with face detection

In this code I want to detect one person's face in the video, track it across all frames, and then extract a sub-video for the time span in which the face is detected.

I am working with code I found on GitHub. I have modified it, but I cannot continue because I am confused about it, so please help me if you can.
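To make the goal concrete, here is a minimal, self-contained sketch of what I am trying to do, assuming ffmpeg is on the PATH and a Haar cascade file is available locally; the helper name extract_face_clip and the paths are placeholders for illustration only, not part of the script below. It checks that every frame inside a time window contains exactly one face and, if so, cuts that window out of the video:

import subprocess
import cv2

def extract_face_clip(video_path, start_sec, end_sec, out_path,
                      cascade_path='haarcascade_frontalface_default.xml'):
    # Check that every frame in [start_sec, end_sec] contains exactly one face,
    # then cut that time range out of the video with ffmpeg.
    face_cascade = cv2.CascadeClassifier(cascade_path)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)

    frame_idx = int(start_sec * fps)
    last_frame = int(end_sec * fps)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)

    while frame_idx < last_frame:
        ret, frame = cap.read()
        frame_idx += 1
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5)
        if len(faces) != 1:
            cap.release()
            return False  # not exactly one face in this frame: reject the segment

    cap.release()
    # cut the segment (time range only; the full script also crops to the face)
    subprocess.call(['ffmpeg', '-y', '-i', video_path,
                     '-ss', str(start_sec), '-to', str(end_sec),
                     '-c:a', 'copy', out_path])
    return True

The full script below additionally uses the dlib landmarks to build a crop box and passes it to ffmpeg with -filter:v crop=w:h:x:y.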

stop_videos = False
for file in files_list:
    if stop_videos:
        break

    # check if the current video has already been processed
    if file in processed_files:
        print(file, 'has already been processed. Skipping it.')
        continue

    num_output_video = 0
    
    # Search for the video files in videos_directory
    video_name = file + '.mp4'
    print('Processing video:', video_name)

    if save_videos:
        # create output directory
        output_dir = os.path.join(results_dir, vids_name, file)

        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)

    # Load watson results
    with codecs.open(os.path.join(
        videos_directory, vids_name, file + '.json'),'r' , 'utf-8') as f:
        stt_results = json.load(f)

    # Extract all the words with confidence >90
    words_data = extract_words_from_watson_results(stt_results, max_words=1)

    # Start the video capture
    cap = cv2.VideoCapture(os.path.join(
        videos_directory, vids_name, video_name))

    # Extract video metadata
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    fps = cap.get(cv2.CAP_PROP_FPS)
    print('video resolution:', width, ' x ', height)
    print('video framerate:', fps)
    
    
    face_cascade = cv2.CascadeClassifier('../opencv/haarcascade_frontalface_default.xml')
    mouth_cascade = cv2.CascadeClassifier('/Users/gupsekobas/opencv_contrib-4.0.1/modules/face/data/cascades/haarcascade_mcs_mouth.xml')

    hog_face_detector = dlib.get_frontal_face_detector()
    dlib_facelandmark = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

    frame_count = 0
    fps_processing = 30.0  # fps holder
    t = cv2.getTickCount() # initiate the tickCounter
    count = 0

    for entry in words_data:
        # Extract speech to text data
        print('entry:', type(entry), entry)
        s_sec, s_millisec = divmod(float(entry['start']), 1)
        e_sec, e_millisec = divmod(float(entry['end']), 1)
        s_min = 0
        e_min = 0
        s_millisec = s_millisec * 1000
        e_millisec = e_millisec * 1000
        
        print('s_sec, s_millisec:', s_sec, s_millisec)

        if s_sec >= 60:
            s_min = math.floor(s_sec / 60.0)
            s_sec = s_sec % 60
        if e_sec >= 60:
            e_min = math.floor(e_sec / 60.0)
            e_sec = e_sec % 60

        # Determine the video frames involved in the stt entry
        min_frame = s_min * fps * 60 + (s_sec * fps)
        max_frame = e_min * fps * 60 + (e_sec * fps)

        # go to min_frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, min_frame)

        frame_count = min_frame
        # read frames from min_frame to max_frame
        num_people = 0
        
        valid_video = True
        # per-frame face bounding box coordinates for the final crop
        bbx1 = []
        bby1 = []
        bbx2 = []
        bby2 = []

        consecutive_frames_no_people = 0
        while frame_count < max_frame:
            if count == 0:
                t = cv2.getTickCount()

            # capture the next frame and advance the frame counter
            ret, frame = cap.read()
            frame_count += 1

            if not ret:
                # stop on a failed read to avoid looping forever
                break
            
            
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # detect faces with the Haar cascade first
            rects = face_cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5)

            if len(rects) == 1:
                # Haar returns (x, y, w, h); dlib.rectangle expects the two corners
                x, y, w, h = rects[0]
                rect = dlib.rectangle(int(x), int(y), int(x + w), int(y + h))
                num_people = 1
            else:
                # zero or several Haar detections: fall back to the dlib HOG detector
                dlib_rects = hog_face_detector(gray, 1)
                num_people = len(dlib_rects)
                if num_people == 1:
                    rect = dlib_rects[0]

            if num_people == 1:
                # get the 68 facial landmarks and collect their coordinates
                shape = dlib_facelandmark(gray, rect)
                x_pts = [shape.part(n).x for n in range(68)]
                y_pts = [shape.part(n).y for n in range(68)]

            # if the frame does not contain exactly one person,
            # count it towards skipping to the next subtitle
            if num_people != 1:
                consecutive_frames_no_people += 1

            if consecutive_frames_no_people >= max_bad_frames:
                print(consecutive_frames_no_people,
                      ' frames without exactly 1 person. Skipping to next subtitle')
                valid_video = False
                break
            
            # if only one person in the scene
            if num_people == 1:
                consecutive_frames_no_people = 0
                
                # bounding box of the face from the 68 landmark points
                x1 = min(x_pts)
                x2 = max(x_pts)
                y1 = min(y_pts)
                y2 = max(y_pts)

                # save the bounding box coordinates for the final crop
                bbx1.append(x1)
                bbx2.append(x2)
                bby1.append(y1)
                bby2.append(y2)

                # draw the bounding box on the frame
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 3)

                # put the fps at which we are processing the video on the frame
                cv2.putText(frame, "{0:.2f}-fps".format(fps_processing),
                            (50, height - 50), cv2.FONT_HERSHEY_COMPLEX,
                            1, (0, 0, 255), 2)

            # Display the frame
            cv2.imshow('Vid', frame)

            # Read the keyboard and exit if ESC was pressed
            k = cv2.waitKey(1) & 0xFF
            if k == 27:
                exit()
            elif k == ord('q'):
                stop_videos = True

            # increment frame counter
            count = count + 1
            # calculate the processing fps every 30 frames
            if count == 30:
                t = (cv2.getTickCount() - t) / cv2.getTickFrequency()
                fps_processing = 30.0 / t
                count = 0

        # if this was a valid video and at least one face bounding box was found
        if valid_video and len(bbx1) > 0:
            num_output_video += 1

            # get the final crop coordinates over all processed frames
            x1 = min(bbx1)
            x2 = max(bbx2)
            y1 = min(bby1)
            y2 = max(bby2)
            bbw = x2 - x1
            bbh = y2 - y1

            entry['bounding_box'] = [x1, y1, bbw, bbh]
            print('entry:', type(entry), entry)

            if save_videos:
                s_hr = 0
                e_hr = 0
                if s_min >= 60:
                    s_hr = math.floor(s_min / 60)
                    s_min = s_min % 60
                if e_min >= 60:
                    e_hr = math.floor(e_min / 60)
                    e_min = e_min % 60

                # cut and crop video
                # ffmpeg -i input.mp4 -ss hh:mm:ss -filter:v crop=w:h:x:y -c:a copy -to hh:mm:ss output.mp4
                ss = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
                    s_hr, s_min, int(s_sec), math.ceil(s_millisec))
                es = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
                    e_hr, e_min, int(e_sec), math.ceil(e_millisec))
                crop = "crop={0:1d}:{1:1d}:{2:1d}:{3:1d}".format(
                    bbw, bbh, x1, y1)

                out_name = os.path.join(output_dir, str(num_output_video))

                subprocess.call(['ffmpeg', #'-hide_banner', '-loglevel', 'panic',
                                '-i', os.path.join(
                                videos_directory, vids_name, video_name),
                                '-ss', ss,
                                '-filter:v', crop, '-c:a', 'copy',
                                '-to', es, out_name +'.mp4'])
                # save the recognized speech and its confidence
                text_file = open(out_name + '.txt', "w")
                text_file.write(entry['text'] + '\n')
                text_file.write(str(entry['conf']))
                text_file.close()

    # delete the entries without a bounding box
    words_data[:] = [dic for dic in words_data
                     if len(dic.get('bounding_box', [])) > 0]

    # append results to annotation file
    append_annotation_file(os.path.join(
        results_dir, dataset_annotation_file), words_data)

    # save name of processed file
    processed_files.append(file)
    with open(os.path.join(results_dir, vid_proc_name), "w") as fp:
        for p_file in processed_files:
            print(p_file, file=fp)

    
# Release resources
cap.release()
cv2.destroyAllWindows()

This is the GitHub link: https://github.com/jrterven/audio-visual-dataset/blob/master/extract_subvideos.py
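For reference, the timestamp (ss, es) and crop strings that the script builds inline can be reproduced with a small standalone helper like this; the names to_timestamp and crop_filter are only for illustration and are not part of the original script:

def to_timestamp(seconds):
    # Convert a time in seconds (float) to an ffmpeg 'hh:mm:ss.mmm' string.
    sec, frac = divmod(float(seconds), 1)
    hr, rem = divmod(int(sec), 3600)
    mins, sec = divmod(rem, 60)
    return "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(hr, mins, sec, int(round(frac * 1000)))

def crop_filter(x1, y1, x2, y2):
    # Build the ffmpeg crop filter string from bounding-box corners.
    return "crop={0:d}:{1:d}:{2:d}:{3:d}".format(x2 - x1, y2 - y1, x1, y1)

# Example: a word spoken from 83.5 s to 85.25 s with a face box (640, 200)-(960, 520)
# to_timestamp(83.5)              -> '00:01:23.500'
# to_timestamp(85.25)             -> '00:01:25.250'
# crop_filter(640, 200, 960, 520) -> 'crop=320:320:640:200'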
