Object Detection using Faster RCNN and SORT
Object recognition techniques are increasingly studied because of their applications in video analytics and image understanding. Traditional methods rely on handcrafted feature generation and shallow networks for training. Deep learning architectures can learn deeper, high-level features and have thus shown better performance than the traditional methods. Object segmentation and tracking methods are widely researched in the computer vision community because of their vast range of applications, which include face detection, medical imaging, video-based surveillance systems, self-driving vehicles, etc. A number of challenges related to obstruction, motion blur, and deformation need to be handled when solving these problems. Heterogeneous objects and interacting objects make it difficult to segment and track objects. Some applications require handling of quick movements and real-time processing, which calls for the design of application-specific algorithms.
Simple Online and Realtime Tracking (SORT) is a simple tracking algorithm that combines Kalman filtering with a method to measure bounding-box overlap. It can track multiple objects in real time by associating the detected objects across frames. A detection algorithm is run first, and its results are used by the SORT algorithm to match the detected objects in subsequent frames. Each box has an object ID, and SORT associates the objects in different frames using simple heuristics such as maximizing the Intersection over Union (IoU) between boxes in subsequent frames.
Problem Statement
Use Faster RCNN and SORT for object detection and tracking, and design a computer vision application that detects objects in people’s hands from videos, with applications in surveillance systems, robotics, and inventory management systems.
Create dataset
Videos of a person capturing objects were collected for training and testing. The videos were captured with cameras at different angles. The dataset includes frames extracted from the videos and, for every frame, an annotation containing the name of the sample, its class, and four bounding-box coordinates captured using OpenCV.
Import libraries
from __future__ import print_function
import os
import sys
import utils
import argparse
import numpy as np
from PIL import Image
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchsummary import summary
os.system("git clone https://github.com/abewley/sort.git")
import transforms as T
from engine import train_one_epoch, evaluate
import cv2
import matplotlib.pyplot as plt
import psutil
import sort
from sort import *
Create Object dataset
class objDataset(object):
    """Single-object detection dataset read from ``<root>/images`` and ``<root>/dicts``.

    Every annotation file in ``dicts`` is paired with the image in ``images``
    that has the same file name without its extension. Files that have no
    partner in the other folder are ignored.

    An annotation file is plain text; only its last two lines are used:

    * second-to-last line: the class name of the object;
    * last line: four whitespace-separated integers ``x y w h``
      (top-left corner, width, height of the bounding box).

    Args:
        root: Folder containing the ``images`` and ``dicts`` sub-folders.
        transforms: Optional callable ``(img, target) -> (img, target)``
            applied to every sample, or ``None``.
        classes: Class names, either as a list or as a comma-separated
            string. Label ids are ``index + 1`` (0 is reserved for the
            background). When ``None``, the module-level ``args.classes``
            is used, as in the original implementation.
    """

    def __init__(self, root, transforms, classes=None):
        self.root = root
        self.transforms = transforms
        self.classes = classes

        # Sorted so that the sample order is deterministic across runs.
        all_imgs = sorted(os.listdir(os.path.join(root, "images")))
        all_dicts = sorted(os.listdir(os.path.join(root, "dicts")))

        # Index images by their extension-less name so that every annotation
        # is matched in O(1) instead of rescanning the whole image list.
        imgs_by_stem = {}
        for img_name in all_imgs:
            stem = os.path.splitext(img_name)[0]
            imgs_by_stem.setdefault(stem, []).append(img_name)

        # self.imgs[i] and self.dicts[i] always describe the same sample.
        self.imgs = []
        self.dicts = []
        for dict_name in all_dicts:
            stem = os.path.splitext(dict_name)[0]
            for img_name in imgs_by_stem.get(stem, ()):
                self.imgs.append(img_name)
                self.dicts.append(dict_name)

    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``target`` is a dict with the keys ``boxes`` (1x4 float tensor in
        ``[x1, y1, x2, y2]`` form), ``labels``, ``image_id``, ``area`` and
        ``iscrowd``.

        Raises:
            ValueError: If the annotated class name is not in the class list.
        """
        # Get image and annotation path
        img_path = os.path.join(self.root, "images", self.imgs[idx])
        dict_path = os.path.join(self.root, "dicts", self.dicts[idx])

        img = Image.open(img_path).convert("RGB")

        # Context manager so the file handle is released even on errors.
        with open(dict_path, "r") as f:
            lines = f.readlines()

        # Last line: "x y w h"; second-to-last line: class name.
        box = [int(s) for s in lines[-1].split()]
        label = lines[-2].strip()

        # Convert [x, y, w, h] to the corner form [x1, y1, x2, y2].
        boxes = torch.as_tensor(
            [[box[0], box[1], box[0] + box[2], box[1] + box[3]]],
            dtype=torch.float32,
        )

        # Prefer the classes given to the constructor; fall back to the
        # command-line arguments only when none were supplied.
        class_names = self.classes
        if class_names is None:
            class_names = args.classes
        if isinstance(class_names, str):
            class_names = class_names.split(",")

        # +1 because label 0 is the background class.
        labels = torch.full(
            (1,), list(class_names).index(label) + 1, dtype=torch.int64
        )

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((1,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of matched image/annotation pairs."""
        return len(self.imgs)
Return predicted boxes and labels for each object
def get_prediction(model, img_path, threshold, classes, device=device):
    """Run the detector on one image and return the confident detections.

    Args:
        model: Detection model; called as ``model([image_tensor])`` and
            expected to return a list whose first element is a dict with
            ``boxes``, ``labels`` and ``scores`` tensors.
            NOTE(review): the model is not switched to eval mode here;
            the caller is presumably responsible for ``model.eval()``.
        img_path: Path of the image to analyse.
        threshold: Only detections with a score strictly greater than this
            value are returned.
        classes: Foreground class names, as a list or as a comma-separated
            string. Label id ``k`` maps to ``classes[k - 1]``; id 0 is the
            background.
        device: Device the image tensor is moved to before inference.

    Returns:
        ``(pred_boxes, pred_class)`` where ``pred_boxes`` is a list of
        ``[(x1, y1), (x2, y2)]`` corner pairs and ``pred_class`` is the list
        of matching class names. Both are empty lists when no detection
        exceeds ``threshold``.
    """
    # First label is background
    if isinstance(classes, str):
        classes = classes.split(",")
    label_names = ["__background__"] + list(classes)

    # Force three channels so grayscale/RGBA files are handled like the
    # training images, which are loaded with convert("RGB").
    img = Image.open(img_path).convert("RGB")
    img = torchvision.transforms.functional.to_tensor(img)

    # Get prediction
    with torch.no_grad():
        pred = model([img.to(device)])

    scores = pred[0]['scores'].detach().cpu().numpy()
    # Boolean mask of the detections scoring above the threshold
    keep = scores > threshold
    if not keep.any():
        return [], []

    # Return class and box coordinates of the kept detections only
    label_ids = pred[0]['labels'].detach().cpu().numpy()[keep]
    boxes = pred[0]['boxes'].detach().cpu().numpy()[keep]
    pred_class = [label_names[i] for i in label_ids]
    pred_boxes = [[(b[0], b[1]), (b[2], b[3])] for b in boxes]
    return pred_boxes, pred_class
Required for SORT as SORT input is in YOLO output form
def to_yolo_form(pred, pred_t):
    """Pack the first ``pred_t + 1`` detections into one 7-column tensor.

    Args:
        pred: Model output; ``pred[0]`` must hold the ``boxes``, ``scores``
            and ``labels`` tensors of a single image.
        pred_t: Index of the last detection to keep.

    Returns:
        A 2-D tensor with one row per kept detection: the four box
        coordinates, the score twice, and the label cast to float.
    """
    detections = pred[0]
    keep = slice(None, pred_t + 1)

    coords = detections['boxes'][keep, :]
    confidence = detections['scores'][keep].unsqueeze(1)
    class_ids = detections['labels'][keep].float().unsqueeze(1)

    # The score column is intentionally duplicated, exactly as before.
    return torch.cat([coords, confidence, confidence, class_ids], 1)
Get fasterRCNN object from torchvision models
# Build a Faster R-CNN detector (ResNet-50 backbone with FPN) from torchvision,
# requesting the pre-trained weights.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Comma-separated class names; `args` is defined outside this section
# (presumably parsed command-line arguments - confirm).
classes = args.classes
# Create list of classes
labels = classes.split(",")
num_classes = len(labels)+1 # N foreground classes + 1 background class
# Get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Replace the pre-trained head with a new one sized for num_classes outputs
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
Train the model
# Set root - folder in which the "images" and "dicts" sub-folders are present
root_path = '.'
# Two dataset objects over the same files: one with the training transforms,
# one with the evaluation transforms.
dataset = objDataset(root_path, get_transform(train=True), labels)
dataset_test = objDataset(root_path, get_transform(train=False), labels)
# One random permutation shared by both subsets so train and test never overlap
indices = torch.randperm(len(dataset)).tolist()
# Split in train and test
# TODO modify the split as per the number of examples
dataset = torch.utils.data.Subset(dataset, indices[:500])
dataset_test = torch.utils.data.Subset(dataset_test, indices[500:600])
# Define training and validation data loaders.
# collate_fn keeps each batch as tuples of images/targets instead of stacking
# them, which detection models need (images can differ in size and the
# targets are dicts).
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=args.batch_size, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=args.batch_size_test, shuffle=False, num_workers=0,
    collate_fn=utils.collate_fn)
# Optimise only the parameters that are not frozen
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# Divide the learning rate by 10 every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = args.epochs
# NOTE(review): data_loader_test is built and `evaluate` is imported, but no
# evaluation call is visible in this section - confirm whether one is intended.
for epoch in range(num_epochs):
    # Train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # Update the learning rate
    lr_scheduler.step()
# Persist the trained weights
torch.save(model.state_dict(), 'fasterRNN-hand.pt')