After much research, I finally got the following script (which works):
from detecto import core, utils, visualize
from detecto.visualize import show_labeled_image, plot_prediction_grid
from torchvision import transforms
import matplotlib.pyplot as plt
import numpy as np
import os
custom_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(900),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ColorJitter(saturation=0.2),
    transforms.ToTensor(),
    utils.normalize_transform(),
])
Train_dataset = core.Dataset('/content/drive/MyDrive/Dataset/train/', transform=custom_transforms)  # run on Google Colab
Test_dataset = core.Dataset('/content/drive/MyDrive/Dataset/test/')
loader = core.DataLoader(Train_dataset, batch_size=2, shuffle=True)
model = core.Model(['obj1', 'obj2', 'obj3', 'obj4'])
losses = model.fit(loader, Test_dataset, epochs=50, lr_step_size=5, learning_rate=0.001, verbose=True)
plt.plot(losses)
plt.show()
# Saving Model
model.save('model.pth')
## PART 2 : TESTING THE MODEL
# Loading Model
model = core.Model.load('model.pth', ['obj1', 'obj2', 'obj3', 'obj4'])
# Testing Model
image = utils.read_image('/content/drive/MyDrive/Dataset/test/test.png')
predictions = model.predict(image)
labels, boxes, scores = predictions
show_labeled_image(image, boxes, labels)
# Threshold to filter out low-confidence results
thresh = 0.5
filtered_indices = np.where(scores > thresh)
filtered_scores = scores[filtered_indices]
filtered_boxes = boxes[filtered_indices]
num_list = filtered_indices[0].tolist()
filtered_labels = [labels[i] for i in num_list]
show_labeled_image(image, filtered_boxes, filtered_labels)
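As an aside, since predict() returns torch tensors, I believe the same filtering could also be written with a boolean mask instead of np.where (an untested sketch that reuses thresh, scores, boxes and labels from the script above):

# Boolean mask over the detection scores (assumes scores is a torch.Tensor)
keep = scores > thresh
filtered_boxes = boxes[keep]    # keeps only the matching rows of the N x 4 box tensor
filtered_scores = scores[keep]
filtered_labels = [label for label, k in zip(labels, keep) if k]
show_labeled_image(image, filtered_boxes, filtered_labels)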
and this script, which also works correctly:
# This script is equivalent to PART 2 of the previous one, but uses a Caffe model
import sys
from imutils.video import VideoStream
from imutils.video import FPS
import numpy as np
import argparse
import imutils
import time
import cv2
import os
# Arguments construction
if len(sys.argv) == 1:
    args = {
        "prototxt": os.path.join(os.path.dirname(__file__), "MobileNetSSD_deploy.prototxt.txt"),  # run on my computer
        "model": os.path.join(os.path.dirname(__file__), "MobileNetSSD_deploy.caffemodel"),
        "confidence": 0.2,
    }
else:
    # launched from the terminal:
    # python3 ObjectRecognition.py --prototxt MobileNetSSD_deploy.prototxt.txt --model MobileNetSSD_deploy.caffemodel
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--prototxt", required=True,
                    help="path to Caffe 'deploy' prototxt file")
    ap.add_argument("-m", "--model", required=True,
                    help="path to Caffe pre-trained model")
    ap.add_argument("-c", "--confidence", type=float, default=0.2,
                    help="minimum probability to filter weak detections")
    args = vars(ap.parse_args())
# MobileNet SSD class list (display labels kept in French)
CLASSES = ["arriere-plan", "avion", "velo", "oiseau", "bateau",
"bouteille", "autobus", "voiture", "chat", "chaise", "vache", "table",
"chien", "cheval", "moto", "personne", "plante en pot", "mouton",
"sofa", "train", "moniteur"]
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
# Load model file
print("Load Neural Network...")
net = cv2.dnn.readNetFromCaffe(args["prototxt"], args["model"])
if __name__ == '__main__':
    # Camera initialisation
    # print("Start Camera...")
    # vs = VideoStream(src=0, resolution=(1600, 1200)).start()
    # vs = VideoStream(usePiCamera=True, resolution=(1600, 1200)).start()
    # vs = cv2.VideoCapture('needles/hellas1.png')  # from video
    time.sleep(2.0)
    fps = FPS().start()
    # Main loop
    while True:
        # Get video stream, max width 800 pixels
        # frame = vs.read()
        frame = cv2.imread(os.path.join(os.path.dirname(__file__), 'Dataset', 'test', 'testimg.png'))  # from image file
        # ret, frame = vs.read()  # from video or IP cam
        frame = imutils.resize(frame, width=800)
        # Create blob from image
        (h, w) = frame.shape[:2]
        blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 0.007843, (300, 300), 127.5)
        # Feed input to neural network
        net.setInput(blob)
        detections = net.forward()
        # Detection loop; detections has shape (1, 1, N, 7), each row being
        # [image_id, class_id, confidence, x_min, y_min, x_max, y_max]
        # with coordinates normalized to [0, 1], hence the scaling by w and h below
        for i in np.arange(0, detections.shape[2]):
            # Object detection probability
            confidence = detections[0, 0, i, 2]
            # Suppress low-probability detections
            if confidence > args["confidence"]:
                # Get index and position of the detected object
                idx = int(detections[0, 0, i, 1])
                box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
                (startX, startY, endX, endY) = box.astype("int")
                # Draw box and label
                label = "{}: {:.2f}%".format(CLASSES[idx],
                    confidence * 100)
                cv2.rectangle(frame, (startX, startY), (endX, endY),
                              COLORS[idx], 2)
                y = startY - 15 if startY - 15 > 15 else startY + 15
                cv2.putText(frame, label, (startX, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2)
                # Save the annotated image
                cv2.imwrite("detection.png", frame)
        # Show video frame
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF
        # Exit script with letter q
        if key == ord("q"):
            break
        # FPS update
        fps.update()
    # Stop the FPS counter and display info
    fps.stop()
    print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
    print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))
    cv2.destroyAllWindows()
    # vs.stop()
    # vs.release()
My question is how to convert the .pth file generated by the first program into the two files, .prototxt.txt and .caffemodel, that the second one expects, so that the two scripts can be linked; or, vice versa, how to adapt the second script so that it works with the model produced by the first. I don't know which direction is better, so I haven't committed to either, and of course nothing I have tried so far works.
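To make the second option concrete, here is the kind of adaptation I have in mind: skipping Caffe entirely and calling the detecto model from inside the OpenCV code. This is an untested sketch, not working code, and the cv2.cvtColor step is my assumption, since OpenCV reads images as BGR while detecto/torchvision expect RGB:

import cv2
from detecto import core

# Load the model trained by the first script
model = core.Model.load('model.pth', ['obj1', 'obj2', 'obj3', 'obj4'])

frame = cv2.imread('testimg.png')
# OpenCV gives BGR; detecto/torchvision expect RGB (assumption)
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
labels, boxes, scores = model.predict(rgb)

# Draw the detections above a 0.5 confidence threshold on the original frame
for label, box, score in zip(labels, boxes, scores):
    if score.item() > 0.5:
        (startX, startY, endX, endY) = box.int().tolist()
        text = "{}: {:.2f}".format(label, score.item())
        cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
        cv2.putText(frame, text, (startX, max(startY - 15, 15)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
cv2.imshow("Frame", frame)
cv2.waitKey(0)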
I must admit I'm at a loss among the many models and formats on offer; I've seen the names YOLO, TensorFlow, ONNX, etc. come up, but my research has come to nothing.
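Of those, ONNX looks to me like the most plausible bridge, since both PyTorch and OpenCV's dnn module can work with it. Something along these lines is what I imagine, although I have not managed to make it work; it assumes that detecto's core.Model exposes its underlying torchvision Faster R-CNN via get_internal_model(), and that cv2.dnn.readNetFromONNX can actually load such a graph, which I am not sure of:

import torch
import cv2
from detecto import core

# Load the trained detecto model and get the underlying torchvision network
model = core.Model.load('model.pth', ['obj1', 'obj2', 'obj3', 'obj4'])
internal = model.get_internal_model()  # torchvision Faster R-CNN (assumption)
internal.eval()

# Export to ONNX; torchvision detection models take a list of CHW tensors
# and reportedly need opset >= 11 to export
dummy = [torch.randn(3, 900, 900)]
torch.onnx.export(internal, dummy, 'model.onnx', opset_version=11)

# If this works, the second script would only need its loading line changed:
net = cv2.dnn.readNetFromONNX('model.onnx')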
I also found https://github.com/WoodsGao/pytorch2caffe, but its README.md is not very explicit.
I hope I'm not being too messy :).
Note: [This question](https://stackoverflow.com/questions/30902056/generate-caffemodel-file) seems to ask the same thing, but it hasn't been answered.