Deep interesting | 11 TensorFlow object detection

Introduction to the

TensorFlow provides an API for detecting objects contained in a picture or video. See the link below for details

Github.com/tensorflow/…

Object detection is different from image classification

Picture classification is to divide the picture into a certain category, that is, to choose one from multiple possible categories. Even if the most likely multiple categories can be output according to the probability, the correct answer in theory is only one
Object detection is to detect all the objects in the picture and mark them with the rectangle Anchor Box. The categories of objects can include many kinds, such as people, cars, animals, road signs, etc., that is, the correct answer can be multiple

Learn how to use the TensorFlow object detection API through several examples

A pre-trained SSD_Mobilenet_v1_COCO model (Single Shot MultiBox Detector) is used here. More available object detection models can be found here

Github.com/tensorflow/…

For example

Load the library

# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image

from utils import label_map_util
from utils import visualization_utils as vis_util
Copy the code

Define some constants

PATH_TO_CKPT = 'ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb'
PATH_TO_LABELS = 'ssd_mobilenet_v1_coco_2017_11_17/mscoco_label_map.pbtxt'
NUM_CLASSES = 90
Copy the code

Load the pre-trained model

detection_graph = tf.Graph()
with detection_graph.as_default():
	od_graph_def = tf.GraphDef()
	with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
		od_graph_def.ParseFromString(fid.read())
		tf.import_graph_def(od_graph_def, name=' ')
Copy the code

Load category label data

label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
Copy the code

An auxiliary function to convert images into arrays, and test the image path

def load_image_into_numpy_array(image):
	(im_width, im_height) = image.size
	return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8)
	
TEST_IMAGE_PATHS = ['test_images/image1.jpg'.'test_images/image2.jpg']
Copy the code

Object detection using models

with detection_graph.as_default():
	with tf.Session(graph=detection_graph) as sess:
	    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
	    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
	    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
	    detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
	    num_detections = detection_graph.get_tensor_by_name('num_detections:0')
	    for image_path in TEST_IMAGE_PATHS:
	    	image = Image.open(image_path)
	    	image_np = load_image_into_numpy_array(image)
	    	image_np_expanded = np.expand_dims(image_np, axis=0)
	    	(boxes, scores, classes, num) = sess.run(
	    		[detection_boxes, detection_scores, detection_classes, num_detections], 
	    		feed_dict={image_tensor: image_np_expanded})
	    	
	    	vis_util.visualize_boxes_and_labels_on_image_array(image_np, np.squeeze(boxes), np.squeeze(classes).astype(np.int32), np.squeeze(scores), category_index, use_normalized_coordinates=True, line_thickness=8)
	    	plt.figure(figsize=[12, 8])
	    	plt.imshow(image_np)
	    	plt.show()
Copy the code

The results are as follows. The first image detected two dogs

The second image detected some people and kites

Camera detection

Install OpenCV for computer vision-related functionality, version 3.3.0.10

pip install opencv-python opencv-contrib-python -i https://pypi.tuna.tsinghua.edu.cn/simple
Copy the code

Check whether the installation is successful and no error is reported

import cv2
tracker = cv2.TrackerMedianFlow_create()
Copy the code

Make changes based on the above code

loadingcv2And get the camera
Constantly taking pictures from the camera
Output the result after detection

The complete code is as follows

# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf

from utils import label_map_util
from utils import visualization_utils as vis_util

import cv2
cap = cv2.VideoCapture(0)

PATH_TO_CKPT = 'ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb'
PATH_TO_LABELS = 'ssd_mobilenet_v1_coco_2017_11_17/mscoco_label_map.pbtxt'
NUM_CLASSES = 90

detection_graph = tf.Graph()
with detection_graph.as_default():
	od_graph_def = tf.GraphDef()
	with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
		od_graph_def.ParseFromString(fid.read())
		tf.import_graph_def(od_graph_def, name=' ')

label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)

with detection_graph.as_default():
	with tf.Session(graph=detection_graph) as sess:
	    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
	    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
	    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
	    detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
	    num_detections = detection_graph.get_tensor_by_name('num_detections:0')
	    while True:
	    	ret, image_np = cap.read()
	    	image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
	    	image_np_expanded = np.expand_dims(image_np, axis=0)
	    	(boxes, scores, classes, num) = sess.run(
	    		[detection_boxes, detection_scores, detection_classes, num_detections], 
	    		feed_dict={image_tensor: image_np_expanded})
	    	
	    	vis_util.visualize_boxes_and_labels_on_image_array(image_np, np.squeeze(boxes), np.squeeze(classes).astype(np.int32), np.squeeze(scores), category_index, use_normalized_coordinates=True, line_thickness=8)
	    	
	    	cv2.imshow('object detection', cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
	    	if cv2.waitKey(25) & 0xFF == ord('q'):
	    		cap.release()
	    		cv2.destroyAllWindows()
	    		break
Copy the code

Video detection

Cv2 is used to read the video and get each frame of the picture, and then each frame after detection is written to a new video file

The generated video file has only images and no sound. Audio processing and video and audio synthesis will be explored later

The complete code is as follows

# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf

from utils import label_map_util
from utils import visualization_utils as vis_util

import cv2
cap = cv2.VideoCapture('Jedi Escape.mov')
ret, image_np = cap.read()
out = cv2.VideoWriter('output.mov', -1, cap.get(cv2.CAP_PROP_FPS), (image_np.shape[1], image_np.shape[0]))

PATH_TO_CKPT = 'ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb'
PATH_TO_LABELS = 'ssd_mobilenet_v1_coco_2017_11_17/mscoco_label_map.pbtxt'
NUM_CLASSES = 90

detection_graph = tf.Graph()
with detection_graph.as_default():
	od_graph_def = tf.GraphDef()
	with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
		od_graph_def.ParseFromString(fid.read())
		tf.import_graph_def(od_graph_def, name=' ')

label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)

with detection_graph.as_default():
	with tf.Session(graph=detection_graph) as sess:
	    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
	    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
	    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
	    detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
	    num_detections = detection_graph.get_tensor_by_name('num_detections:0')
	    while cap.isOpened():
	    	ret, image_np = cap.read()
	    	if len((np.array(image_np)).shape) == 0:
	    		break

	    	image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
	    	image_np_expanded = np.expand_dims(image_np, axis=0)
	    	
	    	(boxes, scores, classes, num) = sess.run(
	    		[detection_boxes, detection_scores, detection_classes, num_detections], 
	    		feed_dict={image_tensor: image_np_expanded})
	    	
	    	vis_util.visualize_boxes_and_labels_on_image_array(image_np, np.squeeze(boxes), np.squeeze(classes).astype(np.int32), np.squeeze(scores), category_index, use_normalized_coordinates=True, line_thickness=8)
	    	out.write(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
	    	
cap.release()
out.release()
cv2.destroyAllWindows()
Copy the code

Playing the processed video, you can see the corresponding detection results in many places

reference

The Introduction and Use – Tensorflow Object Detection API Tutorial:pythonprogramming.net/introductio…
Tensorflow Object Detection API: github.com/tensorflow/…
SSD – Single Shot MultiBox Detector: arxiv.org/pdf/1512.02…

Video lecture course

Deep and interesting (1)

Deep interesting | 11 TensorFlow object detection

Introduction to the

For example

Camera detection

Video detection

reference

Video lecture course

Related Posts

Is it still worth building a visual search engine from scratch?

Image fusion based on MATLAB GUI SIFT+ wavelet transform image Mosaic fusion system

Analysis of classical Network structure – AlexNet by Deep Network Designer