• date, 2018-11-07 14:48:57

    Data collection & labeling

  1. Collect your data in whatever way suits you. The image size usually doesn’t matter much, but it is better to keep it between (48, 48) and (1280, 720).
  2. When you have finished collecting your dataset, label your images.

Install Darknet

  1. Install Darknet; compile it with GPU and OpenCV support if necessary.

Create VOC format dataset

  • (1) In the root of darknet, create a folder named ‘VOCdevkit’, and inside it create a folder for your dataset. Its name must start with ‘VOC’, e.g. ‘VOC2019_oppo’.

    1
    2
    3
    4
    cd /path/darknet
    mkdir VOCdevkit
    cd VOCdevkit
    mkdir VOC2019_oppo
  • (2) Directory like this :

    1
    2
    3
    4
    5
    6
    └── VOCdevkit
        └── VOC2019_oppo
            ├── Annotations
            ├── ImageSets
            │   └── Main
            └── JPEGImages
  • (3) Move the images into JPEGImages and xml files into Annotations.

  • (4) Split the data into train, val and test sets with a Python script like the one below

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    ## split_train_val.py
    import os,random

    # collect the annotation ids: the name of every *.xml file in ./Annotations
    # with its 4-character extension stripped
    dirname = './Annotations'
    files = [f[:-4] for f in os.listdir(dirname) if f.lower().endswith('.xml')]

    # first split: a random half becomes trainval, everything else becomes test
    trainval = random.sample(files, len(files)//2)
    trainval_ids = set(trainval)  # set lookup keeps the filter below linear
    test = [f for f in files if f not in trainval_ids]

    # second split: a random half of trainval becomes train, the rest val
    train = random.sample(trainval, len(trainval)//2)
    train_ids = set(train)
    val = [f for f in trainval if f not in train_ids]

    # save to txt file
    def list2txt(arr, fname):
        """Write every item of ``arr`` on its own line to ``<fname>.txt``.

        Args:
            arr: Iterable of strings (here: image ids without extension).
            fname: Output path without the ``.txt`` suffix, which is appended.
                An existing file is overwritten.
        """
        with open(fname + '.txt', 'w') as f:
            f.writelines(a + '\n' for a in arr)

    # write one <name>.txt per split into the current directory
    for ids, name in ((trainval, 'trainval'), (test, 'test'), (train, 'train'), (val, 'val')):
        list2txt(ids, name)
  • Then run the script; it generates four files. Move them into ImageSets/Main/

    1
    2
    3
    4
    5
    6
    python split_train_val.py

    mv test.txt ImageSets/Main/
    mv train.txt ImageSets/Main/
    mv trainval.txt ImageSets/Main/
    mv val.txt ImageSets/Main/
  • (5) Now you have the directory like this

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    └── VOCdevkit
        └── VOC2019_oppo
            ├── Annotations
            ├── ImageSets
            │   └── Main
            │       ├── test.txt
            │       ├── train.txt
            │       ├── trainval.txt
            │       └── val.txt
            ├── JPEGImages
            └── split_train_val.py

Use voc_label.py to generate the image path lists

1
2
3
cd /path/darknet
touch voc_label.py
vim voc_label.py
  • (1) create a python script
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    ## name voc_label.py

    import xml.etree.ElementTree as ET
    import pickle
    import os
    from os import listdir, getcwd
    from os.path import join

    # 1. change to your labels
    # Example dataset "oppo" with 4 classes: A5s, A7, reno, reno10x.
    # The index of a name in `classes` is the class id written to the labels.
    classes = ['A5s', 'A7', 'reno', 'reno10x']
    # Each entry is (dataset suffix after "VOC", split name).
    sets = [('2019_oppo', split) for split in ('train', 'val', 'test')]

    def convert(size, box):
        """Convert a pixel bounding box to a normalized centre/size box.

        Args:
            size: ``(width, height)`` of the image in pixels.
            box: ``(xmin, xmax, ymin, ymax)`` in pixels -- note the order,
                both x values come first.

        Returns:
            ``(x, y, w, h)``: box centre and box size, where x and w are
            divided by the image width and y and h by the image height.
        """
        dw = 1./(size[0])
        dh = 1./(size[1])
        # box centre; the "- 1" is kept from the original script
        # (presumably it compensates for 1-based VOC pixel coordinates)
        x = (box[0] + box[1])/2.0 - 1
        y = (box[2] + box[3])/2.0 - 1
        w = box[1] - box[0]
        h = box[3] - box[2]
        # scale everything to fractions of the image size
        x = x*dw
        w = w*dw
        y = y*dh
        h = h*dh
        return (x,y,w,h)

    def convert_annotation(year, image_id):
        """Write the label file for one annotated image.

        Reads ``VOC<year>/Annotations/<image_id>.xml`` and writes
        ``VOC<year>/labels/<image_id>.txt`` with one line per kept object:
        ``<class index> <x> <y> <w> <h>`` (box values as returned by
        ``convert``). Objects whose name is not in ``classes`` or that are
        marked difficult are skipped.

        Args:
            year: Dataset suffix that follows ``VOC`` in the folder name.
            image_id: File name of the annotation without its extension.
        """
        # 2. change to your path
        in_path = '/home/ares2/darknet/VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id)
        out_path = '/home/ares2/darknet/VOCdevkit/VOC%s/labels/%s.txt'%(year, image_id)
        # context managers close both files even if parsing fails
        with open(in_path) as in_file, open(out_path, 'w') as out_file:
            tree = ET.parse(in_file)
            root = tree.getroot()
            size = root.find('size')
            w = int(size.find('width').text)
            h = int(size.find('height').text)

            for obj in root.iter('object'):
                cls = obj.find('name').text
                if cls not in classes:
                    continue
                # a missing <difficult> tag is treated as "not difficult"
                difficult = obj.find('difficult')
                if difficult is not None and int(difficult.text) == 1:
                    continue
                cls_id = classes.index(cls)
                xmlbox = obj.find('bndbox')
                # convert() expects (xmin, xmax, ymin, ymax)
                b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text),
                     float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
                bb = convert((w,h), b)
                out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')

    wd = getcwd()

    for year, image_set in sets:
        # Make sure the labels folder exists before label files are written.
        # NOTE: this path is relative to the current directory, while the
        # other paths are absolute -- run the script from the darknet root
        # that the absolute paths point to.
        labels_dir = 'VOCdevkit/VOC%s/labels/'%(year)
        if not os.path.exists(labels_dir):
            os.makedirs(labels_dir)
        # 3. change to your path
        with open('/home/ares2/darknet/VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)) as ids_file:
            image_ids = ids_file.read().strip().split()
        # <year>_<split>.txt lists the absolute image path of every id
        with open('%s_%s.txt'%(year, image_set), 'w') as list_file:
            for image_id in image_ids:
                list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n'%(wd, year, image_id))
                convert_annotation(year, image_id)
  • Remember that there are 3 places you need to change (the comments numbered 1–3 in the script).
  • Running the script generates 3 files:
    • 2019_oppo_train.txt
    • 2019_oppo_val.txt
    • 2019_oppo_test.txt
  • Usually I merge 2019_oppo_train.txt and 2019_oppo_test.txt into 2019_oppo_train.txt
        1
        2
        3
        4
        cd /path/darknet
        mkdir oppo_od_bak
        cd oppo_od_bak
        mkdir cfg
      • In darknet/cfg/ you can find yolo-voc.cfg and yolo-tiny.cfg, and from the official website you can download the pretrained models; for yolo-voc, for example, that is darknet53.conv.74.

Prepare your cfg file

  • The 3 files you need to train YOLO are
    • yourdata.names
    • yourdata.data
    • yourcfg.cfg
      • (1) yourdata.names contains the labels of your dataset, one label per line
      • (2) yourdata.data example
        1
        2
        3
        4
        5
        classes= #classes # number of classes
        train = /path/yourfilename_train.txt # training data
        valid = /path/yourfilename_val.txt # validation data
        names = data/yourname.names # class labels
        backup = /backup/ # directory where the weights are saved
      • remember to delete the comments
      • (3) yourcfg.cfg
      • you can use the yolo-voc.cfg or the yolo-tiny.cfg
      • remember to change these places
1
2
3
4
5
6
7
8
9
10
vim yolo-voc.cfg

## Remember to comment out the Testing settings and uncomment the Training settings
[net]
# Testing
# batch=1
# subdivisions=1
# Training
batch=64
subdivisions=16
  • You should make these changes for every [yolo] layer and for the [convolutional] layer directly before it.
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    [convolutional]
    size=1
    stride=1
    pad=1
    filters=27 ## YOU SHOULD CHANGE THE # OF FILTERS
    ## filters = (classes + 5) * 3
    activation=linear

    [yolo]
    mask = 6,7,8
    anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
    classes=4 ## CHANGE TO THE NUMBER OF YOUR LABELS
    num=9
    jitter=.3
    ignore_thresh = .5
    truth_thresh = 1
    random=1

Start Training

  • First time you train, use the pretrained classification model
    1
    2
    3
    4
    5
    cd path/darknet/
    # yolo-tiny
    ./darknet detector train cfg/yourdata.data cfg/yourcfg.cfg backup/bo_can_tiny_176.weights
    # yolo-voc
    ./darknet detector train cfg/yourdata.data cfg/yourcfg.cfg backup/darknet53.conv.74

Know your log

1
2
3
4
5
6
Region 82 Avg IOU: 0.801934, Class: 0.737764, Obj: 0.782024, No Obj: 0.006216, .5R: 1.000000, .75R: 1.000000, count: 5 
Region 94 Avg IOU: 0.706899, Class: 0.073915, Obj: 0.544467, No Obj: 0.000506, .5R: 1.000000, .75R: 0.000000, count: 1
Region 106 Avg IOU: 0.831056, Class: 0.037965, Obj: 0.026004, No Obj: 0.000057, .5R: 1.000000, .75R: 1.000000, count: 1
Region 82 Avg IOU: 0.731572, Class: 0.800899, Obj: 0.793200, No Obj: 0.005694, .5R: 1.000000, .75R: 0.333333, count: 3
Region 94 Avg IOU: 0.607969, Class: 0.199724, Obj: 0.884315, No Obj: 0.000286, .5R: 1.000000, .75R: 0.000000, count: 1
Region 106 Avg IOU: -nan, Class: -nan, Obj: -nan, No Obj: 0.000015, .5R: -nan, .75R: -nan, count:
  • (1)以上输出显示了所有训练图片的一个批次(batch),批次大小的划分根据我们在 .cfg 文件中设置的subdivisions参数。在我使用的 .cfg 文件中 batch = 64 ,subdivision = 16,所以在训练输出中,训练迭代包含了16组,每组又包含了4张图片,跟设定的batch和subdivision的值一致。
    但是此处有16*3条信息,每组包含三条信息,分别是:
    Region 82 Avg IOU:
    Region 94 Avg IOU:
    Region 106 Avg IOU:
    三个尺度上预测不同大小的框:82卷积层的下采样倍数最大(特征图最小),使用较大的mask(对应较大的anchor),用于预测较大的物体;94卷积层为中间的预测尺度,使用中等的mask;106卷积层的下采样倍数最小(特征图最大),使用较小的mask(对应较小的anchor),用于预测较小的物体。
  • (2)每个batch都会有这样一个输出:
    1
    2706: 1.350835, 1.386559 avg, 0.001000 rate, 3.323842 seconds, 173184 images

2706:batch是第几组。
1.350835:总损失
1.386559 avg : 平均损失
0.001000 rate:当前的学习率
3.323842 seconds: 当前batch训练所花的时间
173184 images : 目前为止参与训练的图片总数 = 2706 * 64

  • (3)
    1
    Region 82 Avg IOU: 0.798032, Class: 0.559781, Obj: 0.515851, No Obj: 0.006533, .5R: 1.000000, .75R: 1.000000,  count: 2

Region Avg IOU: 表示在当前subdivision内的图片的平均IOU,代表预测的矩形框和真实目标的交集与并集之比.
Class: 标注物体分类的正确率,期望该值趋近于1。
Obj: 越接近1越好。
No Obj: 期望该值越来越小,但不为零。
count: count后的值是所有的当前subdivision图片(本例中一共4张)中包含正样本的图片的数量。


Training experience

  • YOLO-TINY

    • It’s a small feature-extraction network, suited to simple scenes.
    • Each class should have more than 500 images.
    • Train for more than 1000 epochs.
    • Fast but less accurate.
  • YOLO-VOC

    • It’s a more complex network, pretrained on ImageNet.
    • Each class should have more than 300 images.
    • Train for more than 10000 epochs.
    • Slow but accurate.
  • Overall, the more images you have, the better the model will be. You can try adding images gradually.

How to run your own yolov3 model with Opencv

  • first you need to install the opencv
  • then, you just copy three files from what you have trained
    1
    2
    3
    yourname.names
    yourcfg.cfg
    yourweights.weights
  • then set their paths in the config file (config.py, which the detection script imports)
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    class FLAGS:
        """Settings that the detection script reads via ``from config import FLAGS``.

        The class-names file is read while the class body executes, i.e. when
        this module is imported, so ``classesFile`` must exist at that point.
        """
        # Initialize the parameters
        confThreshold = 0.65  # Confidence threshold
        nmsThreshold = 0.3  # Non-maximum suppression threshold
        inpWidth = 416  # Width of network's input image
        inpHeight = 416  # Height of network's input image

        # Camera index opened when neither --image nor --video is given
        camera_id = 0

        # Load names of classes
        classesFile = "./shelves_od_300/shelves_od.names"
        with open(classesFile, 'rt') as f:
            # One class name per line. Stripping removes the '\r' left by
            # Windows line endings; blank lines are skipped so an empty file
            # gives an empty list instead of one empty class name.
            classes = [line.strip() for line in f if line.strip()]

        # Give the configuration and weight files for the model and load the network using them
        modelConfiguration = "./shelves_od_300/yolov3-voc.cfg"
        modelWeights = "./shelves_od_300/yolov3-voc_latest.weights"
  • Finally, run the script below
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    # This code is written at BigVision LLC. It is based on the OpenCV project. It is subject to the license terms in the LICENSE file found in this distribution and at http://opencv.org/license.html

    # Usage example: python3 object_detection_yolo.py --video=run.mp4
    # python3 object_detection_yolo.py --image=bird.jpg

    import cv2 as cv
    import argparse
    import sys
    import numpy as np
    import os.path
    import uuid

    from config import FLAGS

    # Copy the detection settings from FLAGS into module-level names
    confThreshold, nmsThreshold = FLAGS.confThreshold, FLAGS.nmsThreshold  # confidence / NMS thresholds
    inpWidth, inpHeight = FLAGS.inpWidth, FLAGS.inpHeight  # size of the network's input image

    classes = FLAGS.classes
    # Module-level counter that postprocess() increments for every box it keeps.
    # (A `global` statement is unnecessary at module level, so none is used here.)
    _i = 1000
    # Get the names of the output layers
    def getOutputsNames(net):
        """Return the names of the network's output layers.

        Args:
            net: Network object providing ``getLayerNames()`` and
                ``getUnconnectedOutLayers()`` (here a ``cv.dnn`` net).

        Returns:
            List of layer names, one per unconnected output layer.
        """
        # Get the names of all the layers in the network
        layersNames = net.getLayerNames()
        # The output layers are the layers with unconnected outputs. Their ids
        # are 1-based. Depending on the OpenCV version they arrive as an
        # (N, 1) array or as a flat array, so flatten before indexing.
        outLayerIds = np.asarray(net.getUnconnectedOutLayers()).flatten()
        return [layersNames[int(i) - 1] for i in outLayerIds]

    # Draw the predicted bounding box
    def drawPred(frame, classId, conf, left, top, right, bottom):
        """Draw one detection (box plus text label) onto ``frame`` in place.

        Args:
            frame: Image to draw on.
            classId: Index into the module-level ``classes`` list.
            conf: Confidence of the detection, printed with two decimals.
            left, top, right, bottom: Box corners in pixel coordinates.
        """
        # Draw a bounding box.
        cv.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)

        label = '%.2f' % conf

        # Get the label for the class name and its confidence
        if classes:
            assert(classId < len(classes))
            label = '%s:%s' % (classes[classId], label)

        # Display the label at the top of the bounding box
        labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        # keep the label inside the image when the box touches the top edge
        top = max(top, labelSize[1])
        # The text size is measured at scale 0.5 but drawn at 0.75, hence the
        # factor 1.5 on the white background rectangle.
        cv.rectangle(frame, (left, top - round(1.5*labelSize[1])), (left + round(1.5*labelSize[0]), top + baseLine), (255, 255, 255), cv.FILLED)
        cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,0), 1)

    # Remove the bounding boxes with low confidence using non-maxima suppression
    def postprocess(frame, outs):
        """Filter the raw network output and draw the surviving boxes on ``frame``.

        Args:
            frame: Image to draw on; its shape supplies the pixel height and width.
            outs: Outputs of the network's output layers. Each row is read as
                ``[cx, cy, w, h, <skipped>, score per class...]`` with the box
                values given as fractions of the frame size.

        Side effects:
            Draws on ``frame`` and increments the module-level counter ``_i``
            once per box that survives non-maximum suppression.
        """
        global _i
        frameHeight = frame.shape[0]
        frameWidth = frame.shape[1]

        # Scan through all the bounding boxes output from the network and keep only the
        # ones with high confidence scores. Assign the box's class label as the class with the highest score.
        classIds = []
        confidences = []
        boxes = []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                classId = np.argmax(scores)
                confidence = scores[classId]
                if confidence > confThreshold:
                    center_x = int(detection[0] * frameWidth)
                    center_y = int(detection[1] * frameHeight)
                    width = int(detection[2] * frameWidth)
                    height = int(detection[3] * frameHeight)
                    left = int(center_x - width / 2)
                    top = int(center_y - height / 2)
                    classIds.append(classId)
                    confidences.append(float(confidence))
                    boxes.append([left, top, width, height])

        # Perform non maximum suppression to eliminate redundant overlapping boxes with
        # lower confidences.
        indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
        # Depending on the OpenCV version the result is an (N, 1) array, a flat
        # array, or an empty tuple when nothing is kept; flatten handles all.
        for i in np.asarray(indices).flatten():
            i = int(i)
            left, top, width, height = boxes[i]

            # To save the detected region, crop
            # frame[top:top+height, left:left+width] (clamp left/top to >= 0
            # first) and write it with cv.imwrite, e.g. for every 5th value of _i.
            _i = _i + 1
            drawPred(frame, classIds[i], confidences[i], left, top, left + width, top + height)



    def processing_yolov3(args):
        """Run the detector on an image, a video file or the webcam.

        Args:
            args: Parsed command-line arguments with ``image`` and ``video``
                attributes (file paths or ``None``). If neither is set, the
                camera ``FLAGS.camera_id`` is opened.

        Annotated frames are shown in a window until a key is pressed or the
        input ends. An image input is written to ``<name>_yolo_out_py.jpg``,
        a video input to ``<name>_yolo_out_py.avi`` and webcam input to
        ``yolo_out_py.avi``. Exits with status 1 if the given file is missing.
        """
        net = cv.dnn.readNetFromDarknet(FLAGS.modelConfiguration, FLAGS.modelWeights)
        net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
        net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

        # Process inputs
        winName = 'Deep learning object detection in OpenCV'
        cv.namedWindow(winName, cv.WINDOW_NORMAL)

        outputFile = "yolo_out_py.avi"
        if (args.image):
            # Open the image file
            if not os.path.isfile(args.image):
                print("Input image file ", args.image, " doesn't exist")
                sys.exit(1)
            cap = cv.VideoCapture(args.image)
            # splitext copes with extensions of any length (.jpeg, .png, ...)
            outputFile = os.path.splitext(args.image)[0]+'_yolo_out_py.jpg'
        elif (args.video):
            # Open the video file
            if not os.path.isfile(args.video):
                print("Input video file ", args.video, " doesn't exist")
                sys.exit(1)
            cap = cv.VideoCapture(args.video)
            outputFile = os.path.splitext(args.video)[0]+'_yolo_out_py.avi'
        else:
            # Webcam input
            cap = cv.VideoCapture(FLAGS.camera_id)

        # NOTE(review): this requests width 720 and height 1280 (portrait);
        # confirm that 1280x720 was not intended.
        cap.set(cv.CAP_PROP_FRAME_WIDTH, 720)
        cap.set(cv.CAP_PROP_FRAME_HEIGHT, 1280)

        # Get the video writer initialized to save the output video
        vid_writer = None
        if (not args.image):
            vid_writer = cv.VideoWriter(outputFile, cv.VideoWriter_fourcc('M','J','P','G'), 30, (round(cap.get(cv.CAP_PROP_FRAME_WIDTH)),round(cap.get(cv.CAP_PROP_FRAME_HEIGHT))))

        try:
            # the output layers do not change between frames, look them up once
            outputLayers = getOutputsNames(net)

            # any key press ends the loop
            while cv.waitKey(1) < 0:

                # get frame from the video
                hasFrame, frame = cap.read()

                # Stop the program if reached end of video
                if not hasFrame:
                    print("Done processing !!!")
                    print("Output file is stored as ", outputFile)
                    cv.waitKey(3000)
                    break

                # Create a 4D blob from a frame.
                blob = cv.dnn.blobFromImage(frame, 1/255, (inpWidth, inpHeight), [0,0,0], 1, crop=False)

                # Sets the input to the network
                net.setInput(blob)

                # Runs the forward pass to get output of the output layers
                outs = net.forward(outputLayers)

                # Remove the bounding boxes with low confidence
                postprocess(frame, outs)

                # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
                t, _ = net.getPerfProfile()
                label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
                cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))

                # Write the frame with the detection boxes
                if (args.image):
                    cv.imwrite(outputFile, frame.astype(np.uint8))
                else:
                    vid_writer.write(frame.astype(np.uint8))

                cv.imshow(winName, frame)
        finally:
            # Release the device and the writer on every exit path (end of
            # input, key press or error) so the output video is closed.
            cap.release()
            if vid_writer is not None:
                vid_writer.release()
            cv.destroyAllWindows()

    if __name__ == '__main__':
        # Command-line entry point: --image or --video selects a file input;
        # with neither, processing_yolov3 falls back to the webcam.
        parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
        parser.add_argument('--image', help='Path to image file.')
        parser.add_argument('--video', help='Path to video file.')
        args = parser.parse_args()

        processing_yolov3(args)