# 1. Algorithm source program (Darknet YOLOv3 cfg)

[net]
# Testing
# batch=1
# subdivisions=1
# Training
batch=64
# Number of training samples per mini-batch chunk: net->batch /= subdivisions
subdivisions=16
width=416
height=416
channels=3
# Momentum
momentum=0.9
# Weight-decay regularization
decay=0.0005
# Data augmentation: rotation angle
angle=0
# Data augmentation: saturation adjustment
saturation = 1.5
# Data augmentation: exposure adjustment
exposure = 1.5
# Data augmentation: hue adjustment
hue=.1

# The learning rate determines the speed of weight updates.
learning_rate=0.001
# While the iteration count is below burn_in, the learning rate is ramped up
# one way; once it exceeds burn_in, the configured policy takes over.
burn_in=1000
# Number of iterations at which training stops
max_batches = 50200
# Learning-rate update strategy
policy=steps
# Iterations at which the "steps" policy rescales the learning rate
steps=40000,45000
scales=.1,.1

# A 3x3 kernel with pad and stride 1 keeps the feature-map size unchanged;
# stride 2 halves the original size.
[convolutional]
# Whether to apply batch normalization
batch_normalize=1
# Number of convolution kernels (output channels)
filters=32
# Convolution kernel size
size=3
stride=1
pad=1
# Layer activation function
activation=leaky

# Downsample

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
# from=-3 references the layer three back (a ResNet-style skip connection)
from=-3
# Activation function of the shortcut layer
activation=linear

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

# ... repeated convolutional/shortcut blocks omitted here ...

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
# filters = num*(classes+5); the 5 means 4 box coordinates + 1 confidence.
# num is the number of boxes predicted per cell at this yolo scale (3).
# The VOC dataset has 20 classes, the COCO dataset has 80.
filters=75
activation=linear

[yolo]
# Indices of the anchors used at this scale
mask = 6,7,8
# Anchor sizes
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=20
# Total number of predicted boxes per grid cell across all scales;
# must match the number of anchors.
num=9
# Data augmentation: jitter is the range of random aspect-ratio adjustment.
jitter=.3
# IOU threshold used in the loss. NOTE(review): in darknet, a predicted box
# whose IOU with the ground truth exceeds ignore_thresh is EXCLUDED from the
# no-objectness loss (it is not penalized); boxes below the threshold do
# contribute to it.
ignore_thresh = .5
truth_thresh = 1
random=1

# A route layer's "layers" attribute holds one or two values. With a single
# value (-4 here) it outputs the feature map of the layer four back from the
# route layer.
[route]
layers = -4

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

# With two values, the route layer outputs the feature maps of the indexed
# layers concatenated along the depth dimension — here the previous layer
# (-1) and layer 61.

[route]
layers = -1, 61

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=75
activation=linear

[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=20
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 36

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=75
activation=linear

# There are 9 anchors in total, but only those selected by the mask are used.
# Here mask = 0,1,2 selects the first three anchors, so each cell predicts
# 3 boxes. In total the detection network has 3 scales and 9 anchors.

[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=20
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1