caffe的大多数层是由c++写成的,借助于c++的高效性,网络可以快速训练。但是我们有时候需要自己写点输入层以应对各种不同的数据输入,比如你需要在图像中取块而不想把数据写成LMDB,这时候可以考虑使用python直接写一个层。而且输入层不需要GPU加速,所以写起来也比较容易。
python层怎么用
先看一个网上的例子吧(来自 http://chrischoy.github.io/research/caffe-python-layer/ )
- layer {
- type: 'Python'
- name: 'loss'
- top: 'loss'
- bottom: 'ipx'
- bottom: 'ipy'
- python_param {
- # the module name -- usually the filename -- that needs to be in $PYTHONPATH
- module: 'pyloss'
- # the layer name -- the class name in the module
- layer: 'EuclideanLossLayer'
- }
- # set loss weight so Caffe knows this is a loss layer
- loss_weight: 1
- }
这里的type就只有Python一种,然后top,bottom和常见的层是一样的,module就是你的python module名字,一般就是文件名,然后layer就是定义的类的名字。
python层怎么写
这里就以 Fully Convolutional Networks for Semantic Segmentation 论文中公布的代码作为示例,解释python层该怎么写。
- import caffe
- import numpy as np
- from PIL import Image
- import random
class VOCSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from PASCAL VOC
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - voc_dir: path to PASCAL VOC year dir
        - split: train / val / test
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for PASCAL VOC semantic segmentation.

        example

        params = dict(voc_dir="/path/to/PASCAL/VOC2011",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="val")

        Raises Exception if the layer does not have exactly two tops and
        no bottoms, or if the split file lists no images.
        """
        # config
        # NOTE(review): param_str is evaluated as Python code. Only use this
        # layer with prototxt files you trust.
        params = eval(self.param_str)
        self.voc_dir = params['voc_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels; the context manager closes the
        # split file instead of leaking the handle
        split_f = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,
                                                            self.split)
        with open(split_f, 'r') as f:
            self.indices = f.read().splitlines()
        if not self.indices:
            # fail here with a clear message rather than later inside
            # random.randint() / reshape()
            raise Exception("No image indices found in {}".format(split_f))
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices) - 1)

    def reshape(self, bottom, top):
        """Load the current image/label pair and resize both tops to fit it."""
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)

    def forward(self, bottom, top):
        """Copy the loaded pair into the tops and choose the next index."""
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices) - 1)
        else:
            # sequential order, wrapping around at the end of the list
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0

    def backward(self, top, propagate_down, bottom):
        """Data layer: there is nothing to back-propagate."""
        pass

    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:

        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:, :, ::-1]
        in_ -= self.mean
        in_ = in_.transpose((2, 0, 1))
        return in_

    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.

        The leading singleton dimension is required by the loss.
        """
        im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))
        label = np.array(im, dtype=np.uint8)
        label = label[np.newaxis, ...]
        return label
class SBDDSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from the SBDD extended labeling
    of PASCAL VOC for semantic segmentation
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - sbdd_dir: path to SBDD `dataset` dir
        - split: train / seg11valid
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for SBDD semantic segmentation.

        N.B. seg11valid is the set of segval11 that does not intersect with
        SBDD. Find it here:
        https://gist.github.com/shelhamer/edb330760338892d511e.

        example

        params = dict(sbdd_dir="/path/to/SBDD/dataset",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="valid")

        Raises Exception if the layer does not have exactly two tops and
        no bottoms, or if the split file lists no images.
        """
        # config
        # NOTE(review): param_str is evaluated as Python code. Only use this
        # layer with prototxt files you trust.
        params = eval(self.param_str)
        self.sbdd_dir = params['sbdd_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels; the context manager closes the
        # split file instead of leaking the handle
        split_f = '{}/{}.txt'.format(self.sbdd_dir, self.split)
        with open(split_f, 'r') as f:
            self.indices = f.read().splitlines()
        if not self.indices:
            # fail here with a clear message rather than later inside
            # random.randint() / reshape()
            raise Exception("No image indices found in {}".format(split_f))
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices) - 1)

    def reshape(self, bottom, top):
        """Load the current image/label pair and resize both tops to fit it."""
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)

    def forward(self, bottom, top):
        """Copy the loaded pair into the tops and choose the next index."""
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices) - 1)
        else:
            # sequential order, wrapping around at the end of the list
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0

    def backward(self, top, propagate_down, bottom):
        """Data layer: there is nothing to back-propagate."""
        pass

    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:

        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:, :, ::-1]
        in_ -= self.mean
        in_ = in_.transpose((2, 0, 1))
        return in_

    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.

        The leading singleton dimension is required by the loss.
        """
        # imported here so that scipy is only needed when this layer is used
        import scipy.io
        mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))
        label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
        label = label[np.newaxis, ...]
        return label
每个类都是层,类的名字就是layer参数的名字。这两个都是数据输入层,由于需要一个data,一个label,所以有两个top,没有bottom。 类直接继承的是caffe.Layer,然后必须重写setup(),reshape(),forward(),backward()函数,其他的函数可以自己定义,没有限制。 setup()是类启动时该做的事情,比如层所需数据的初始化。 reshape()就是取数据然后把它规范化为四维的矩阵。每次取数据都会调用此函数。 forward()就是网络的前向运行,这里就是把取到的数据往前传递,因为没有其他运算。 backward()就是网络的反馈,data层是没有反馈的,所以这里就直接pass。
PS
这里就把一些资料整合起来,以供参考吧。 1、caffe官网现在开始有了点pycaffe的资料,但是鉴于caffe经常更新,不知道什么时候就把它删除,所以摘录到此。 文件: pyloss.py
- import caffe
- import numpy as np
class EuclideanLossLayer(caffe.Layer):
    """
    Euclidean loss written as a Python layer.

    Mirrors the C++ EuclideanLossLayer and serves as a demonstration of the
    setup / reshape / forward / backward interface of Python layers.
    """

    def setup(self, bottom, top):
        """Require exactly two bottoms (the two inputs being compared)."""
        if len(bottom) != 2:
            raise Exception("Need two inputs to compute distance.")

    def reshape(self, bottom, top):
        """Validate input sizes, allocate the diff buffer, shape the top."""
        first, second = bottom[0], bottom[1]
        # both inputs must hold the same number of elements
        if first.count != second.count:
            raise Exception("Inputs must have the same dimension.")
        # element-wise difference buffer, shaped like the first input
        self.diff = np.zeros_like(first.data, dtype=np.float32)
        # the loss itself is a single value
        top[0].reshape(1)

    def forward(self, bottom, top):
        """Store the difference and write sum(diff^2) / num / 2 to the top."""
        self.diff[...] = bottom[0].data - bottom[1].data
        squared_error = np.sum(self.diff**2)
        top[0].data[...] = squared_error / bottom[0].num / 2.

    def backward(self, top, propagate_down, bottom):
        """Write +diff/num to the first bottom and -diff/num to the second."""
        for i, sign in enumerate((1, -1)):
            # only fill gradients that were actually requested
            if propagate_down[i]:
                bottom[i].diff[...] = sign * self.diff / bottom[i].num
下面这个就是如何使用这个层了: linreg.prototxt
- name: 'LinearRegressionExample'
- # define a simple network for linear regression on dummy data
- # that computes the loss by a PythonLayer.
- layer {
- type: 'DummyData'
- name: 'x'
- top: 'x'
- dummy_data_param {
- shape: { dim: 10 dim: 3 dim: 2 }
- data_filler: { type: 'gaussian' }
- }
- }
- layer {
- type: 'DummyData'
- name: 'y'
- top: 'y'
- dummy_data_param {
- shape: { dim: 10 dim: 3 dim: 2 }
- data_filler: { type: 'gaussian' }
- }
- }
- # include InnerProduct layers for parameters
- # so the net will need backward
- layer {
- type: 'InnerProduct'
- name: 'ipx'
- top: 'ipx'
- bottom: 'x'
- inner_product_param {
- num_output: 10
- weight_filler { type: 'xavier' }
- }
- }
- layer {
- type: 'InnerProduct'
- name: 'ipy'
- top: 'ipy'
- bottom: 'y'
- inner_product_param {
- num_output: 10
- weight_filler { type: 'xavier' }
- }
- }
- layer {
- type: 'Python'
- name: 'loss'
- top: 'loss'
- bottom: 'ipx'
- bottom: 'ipy'
- python_param {
- # the module name -- usually the filename -- that needs to be in $PYTHONPATH
- module: 'pyloss'
- # the layer name -- the class name in the module
- layer: 'EuclideanLossLayer'
- }
- # set loss weight so Caffe knows this is a loss layer.
- # since PythonLayer inherits directly from Layer, this isn't automatically
- # known to Caffe
- loss_weight: 1
- }
pascal_multilabel_datalayers.py
- # imports
- import json
- import time
- import pickle
- import scipy.misc
- import skimage.io
- import caffe
- import numpy as np
- import os.path as osp
- from xml.dom import minidom
- from random import shuffle
- from threading import Thread
- from PIL import Image
- from tools import SimpleTransformer
class PascalMultilabelDataLayerSync(caffe.Layer):
    """
    Simple synchronous data layer for training a multilabel model on PASCAL.
    """

    def setup(self, bottom, top):
        """Parse the layer parameters, build the loader and shape both tops."""
        self.top_names = ['data', 'label']

        # param_str holds a Python dict literal with the layer parameters
        params = eval(self.param_str)
        # validate before anything is read from the dict
        check_params(params)

        self.batch_size = params['batch_size']
        # the loader yields one (image, multilabel) pair per call
        self.batch_loader = BatchLoader(params, None)

        # The input image size is fixed, so the tops are shaped once here
        # rather than on every reshape() call.
        height = params['im_shape'][0]
        width = params['im_shape'][1]
        top[0].reshape(self.batch_size, 3, height, width)
        # 20 label channels, one per PASCAL class
        top[1].reshape(self.batch_size, 20)

        print_info("PascalMultilabelDataLayerSync", params)

    def reshape(self, bottom, top):
        """Nothing to do: the tops were given a fixed shape in setup()."""
        pass

    def forward(self, bottom, top):
        """Fill the tops with batch_size freshly loaded samples."""
        for slot in range(self.batch_size):
            image, labels = self.batch_loader.load_next_image()
            # write straight into the caffe blobs
            top[0].data[slot, ...] = image
            top[1].data[slot, ...] = labels

    def backward(self, top, propagate_down, bottom):
        """Data layers do not back-propagate."""
        pass
class BatchLoader(object):
    """
    This class abstracts away the loading of images.

    Images can either be loaded singly, or in a batch. The latter is used for
    the asynchronous data layer to preload batches while other processing is
    performed.
    """

    def __init__(self, params, result):
        """
        Read the image index list for the requested split.

        params must provide 'batch_size', 'pascal_root', 'im_shape' and
        'split'; result is stored unchanged on the instance.
        """
        self.result = result
        self.batch_size = params['batch_size']
        self.pascal_root = params['pascal_root']
        self.im_shape = params['im_shape']

        # get list of image indexes; the context manager closes the list
        # file instead of leaking the handle
        list_file = params['split'] + '.txt'
        list_path = osp.join(self.pascal_root, 'ImageSets/Main', list_file)
        with open(list_path) as f:
            self.indexlist = [line.rstrip('\n') for line in f]
        self._cur = 0  # current image

        # this class does some simple data-manipulations
        self.transformer = SimpleTransformer()

        # single-argument print(...) behaves the same on Python 2 and 3
        print("BatchLoader initialized with {} images".format(
            len(self.indexlist)))

    def load_next_image(self):
        """
        Load the next image in a batch.

        Returns a (preprocessed image, multilabel vector) pair, where the
        multilabel vector is a float32 array of length 20.
        """
        # Did we finish an epoch? Then start over in a new random order.
        if self._cur == len(self.indexlist):
            self._cur = 0
            shuffle(self.indexlist)

        # Load an image
        index = self.indexlist[self._cur]  # Get the image index
        image_file_name = index + '.jpg'
        im = np.asarray(Image.open(
            osp.join(self.pascal_root, 'JPEGImages', image_file_name)))
        # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3, so this
        # call needs an older SciPy - confirm the target environment.
        im = scipy.misc.imresize(im, self.im_shape)  # resize

        # do a simple horizontal flip as data augmentation
        # (flip is either -1, which reverses the columns, or +1)
        flip = np.random.choice(2) * 2 - 1
        im = im[:, ::flip, :]

        # Load and prepare ground truth
        multilabel = np.zeros(20).astype(np.float32)
        anns = load_pascal_annotation(index, self.pascal_root)
        for label in anns['gt_classes']:
            # in the multilabel problem we don't care how MANY instances
            # there are of each class. Only if they are present.
            # The "-1" is b/c we are not interested in the background
            # class.
            multilabel[label - 1] = 1

        self._cur += 1
        return self.transformer.preprocess(im), multilabel
def load_pascal_annotation(index, pascal_root):
    """
    Parse the PASCAL .xml annotation file of one image.

    This code is borrowed from Ross Girshick's FAST-RCNN code
    (https://github.com/rbgirshick/fast-rcnn).
    See publication for further details: (http://arxiv.org/abs/1504.08083).

    Thanks Ross!

    Reads <pascal_root>/Annotations/<index>.xml and returns a dict with:

    - 'boxes': (num_objs, 4) uint16 array of [xmin, ymin, xmax, ymax],
      each shifted by -1 to make it 0-based
    - 'gt_classes': (num_objs,) int32 array of class indices (1..20)
    - 'gt_overlaps': (num_objs, 21) sparse CSR matrix with a 1.0 in the
      column of each object's class
    - 'flipped': always False
    - 'index': the index that was passed in

    Raises KeyError if an object name is not one of the 20 PASCAL classes.
    """
    # scipy.sparse is not imported at module level, so import it where it is
    # used to make sure the submodule is actually loaded
    import scipy.sparse

    classes = ('__background__',  # always index 0
               'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair',
               'cow', 'diningtable', 'dog', 'horse',
               'motorbike', 'person', 'pottedplant',
               'sheep', 'sofa', 'train', 'tvmonitor')
    # range() instead of the Python-2-only xrange(); the result is identical
    class_to_ind = dict(zip(classes, range(21)))

    filename = osp.join(pascal_root, 'Annotations', index + '.xml')

    def get_data_from_tag(node, tag):
        # text content of the first <tag> element found below node
        return node.getElementsByTagName(tag)[0].childNodes[0].data

    with open(filename) as f:
        data = minidom.parseString(f.read())

    objs = data.getElementsByTagName('object')
    num_objs = len(objs)

    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
    gt_classes = np.zeros((num_objs), dtype=np.int32)
    overlaps = np.zeros((num_objs, 21), dtype=np.float32)

    # Load object bounding boxes into a data frame.
    for ix, obj in enumerate(objs):
        # Make pixel indexes 0-based
        x1 = float(get_data_from_tag(obj, 'xmin')) - 1
        y1 = float(get_data_from_tag(obj, 'ymin')) - 1
        x2 = float(get_data_from_tag(obj, 'xmax')) - 1
        y2 = float(get_data_from_tag(obj, 'ymax')) - 1
        # class names are matched case-insensitively, ignoring outer spaces
        cls = class_to_ind[
            str(get_data_from_tag(obj, "name")).lower().strip()]
        boxes[ix, :] = [x1, y1, x2, y2]
        gt_classes[ix] = cls
        overlaps[ix, cls] = 1.0

    overlaps = scipy.sparse.csr_matrix(overlaps)

    return {'boxes': boxes,
            'gt_classes': gt_classes,
            'gt_overlaps': overlaps,
            'flipped': False,
            'index': index}
def check_params(params):
    """
    A utility function to check the parameters for the data layers.

    Raises AssertionError when 'split', 'batch_size', 'pascal_root' or
    'im_shape' is missing from params. The error is raised explicitly rather
    than with an assert statement so the check still runs under `python -O`.
    """
    if 'split' not in params:
        raise AssertionError(
            'Params must include split (train, val, or test).')

    for key in ('batch_size', 'pascal_root', 'im_shape'):
        if key not in params:
            raise AssertionError('Params must include {}'.format(key))
def print_info(name, params):
    """
    Output some info regarding the class.

    Prints one line containing name and the 'split', 'batch_size' and
    'im_shape' entries of params.
    """
    # single-argument print(...) behaves the same on Python 2 and 3
    print("{} initialized for split: {}, with bs: {}, im_shape: {}.".format(
        name,
        params['split'],
        params['batch_size'],
        params['im_shape']))
caffenet.py
- from __future__ import print_function
- from caffe import layers as L, params as P, to_proto
- from caffe.proto import caffe_pb2
- # helper function for common structures
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1):
    """Put a Convolution and an in-place ReLU on bottom; return (conv, relu)."""
    conv_settings = dict(kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, group=group)
    conv = L.Convolution(bottom, **conv_settings)
    relu = L.ReLU(conv, in_place=True)
    return conv, relu
def fc_relu(bottom, nout):
    """Put an InnerProduct and an in-place ReLU on bottom; return (fc, relu)."""
    inner = L.InnerProduct(bottom, num_output=nout)
    activated = L.ReLU(inner, in_place=True)
    return inner, activated
def max_pool(bottom, ks, stride=1):
    """Return a MAX Pooling layer on bottom with kernel size ks and stride."""
    pooling_settings = dict(pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
    return L.Pooling(bottom, **pooling_settings)
def caffenet(lmdb, batch_size=256, include_acc=False):
    """
    Describe the CaffeNet layer graph and return it via to_proto().

    lmdb is the source of the LMDB Data layer, batch_size its batch size.
    The graph always ends in a SoftmaxWithLoss layer; with include_acc=True
    an Accuracy layer on the same fc8/label pair is included as well.
    """
    # input: LMDB data layer with two tops (data, label)
    transform = dict(crop_size=227, mean_value=[104, 117, 123], mirror=True)
    data, label = L.Data(source=lmdb, backend=P.Data.LMDB,
                         batch_size=batch_size, ntop=2,
                         transform_param=transform)

    # stage 1: conv -> relu -> pool -> LRN
    _, act1 = conv_relu(data, 11, 96, stride=4)
    pooled1 = max_pool(act1, 3, stride=2)
    normed1 = L.LRN(pooled1, local_size=5, alpha=1e-4, beta=0.75)

    # stage 2: same pattern, grouped convolution
    _, act2 = conv_relu(normed1, 5, 256, pad=2, group=2)
    pooled2 = max_pool(act2, 3, stride=2)
    normed2 = L.LRN(pooled2, local_size=5, alpha=1e-4, beta=0.75)

    # stages 3-5: three stacked convolutions, then a final pooling
    _, act3 = conv_relu(normed2, 3, 384, pad=1)
    _, act4 = conv_relu(act3, 3, 384, pad=1, group=2)
    _, act5 = conv_relu(act4, 3, 256, pad=1, group=2)
    pooled5 = max_pool(act5, 3, stride=2)

    # classifier: two fc+relu+dropout stages and a 1000-way output
    _, act6 = fc_relu(pooled5, 4096)
    dropped6 = L.Dropout(act6, in_place=True)
    _, act7 = fc_relu(dropped6, 4096)
    dropped7 = L.Dropout(act7, in_place=True)
    fc8 = L.InnerProduct(dropped7, num_output=1000)

    # the loss is always emitted; accuracy only on request
    outputs = [L.SoftmaxWithLoss(fc8, label)]
    if include_acc:
        outputs.append(L.Accuracy(fc8, label))
    return to_proto(*outputs)
def make_net():
    """Write the train and test net definitions to train/test.prototxt."""
    # (output file, positional args, keyword args) for each caffenet() call
    jobs = (
        ('train.prototxt', ('/path/to/caffe-train-lmdb',), {}),
        ('test.prototxt', ('/path/to/caffe-val-lmdb',),
         dict(batch_size=50, include_acc=True)),
    )
    for out_name, args, kwargs in jobs:
        with open(out_name, 'w') as out_file:
            print(caffenet(*args, **kwargs), file=out_file)


# generate both prototxt files when run as a script
if __name__ == '__main__':
    make_net()
tools.py
- import numpy as np
class SimpleTransformer(object):
    """
    SimpleTransformer is a simple class for preprocessing and deprocessing
    images for caffe.
    """

    def __init__(self, mean=(128, 128, 128)):
        """Store the per-channel mean (as float32) and a scale of 1.0."""
        # the default is an immutable tuple, not a shared mutable list
        self.mean = np.array(mean, dtype=np.float32)
        self.scale = 1.0

    def set_mean(self, mean):
        """
        Set the mean to subtract for centering the data.
        """
        # converted to float32 so it matches what __init__ stores
        self.mean = np.asarray(mean, dtype=np.float32)

    def set_scale(self, scale):
        """
        Set the data scaling.
        """
        self.scale = scale

    def preprocess(self, im):
        """
        preprocess() emulates the pre-processing occurring in the vgg16 caffe
        prototxt.

        Takes a height x width x channel image and returns a new float32
        array in channel x height x width order, with the channel order
        reversed (RGB -> BGR), the mean subtracted and the scale applied.
        The input array is not modified.
        """
        # np.array always copies, so the in-place operations below never
        # touch the caller's array
        im = np.array(im, dtype=np.float32)
        im = im[:, :, ::-1]  # change to BGR
        im -= self.mean
        im *= self.scale
        im = im.transpose((2, 0, 1))

        return im

    def deprocess(self, im):
        """
        inverse of preprocess()

        Takes a channel x height x width array and returns a uint8
        height x width x channel image. The input array is not modified.
        """
        # transpose() only returns a view; copy it so the in-place division
        # and addition below do not overwrite the caller's data
        im = im.transpose(1, 2, 0).copy()
        im /= self.scale
        im += self.mean
        im = im[:, :, ::-1]  # change to RGB

        return np.uint8(im)
class CaffeSolver(object):
    """
    Caffesolver is a class for creating a solver.prototxt file. It sets
    default values and can export a solver parameter file.

    Note that all parameters are stored as strings. String variables are
    stored as strings in strings (i.e. including their double quotes).
    """

    def __init__(self, testnet_prototxt_path="testnet.prototxt",
                 trainnet_prototxt_path="trainnet.prototxt", debug=False):
        """
        Fill self.sp with the default solver parameters.

        With debug=True, max_iter, test_iter, test_interval and display are
        overridden with small values.
        """
        self.sp = {}

        # critical:
        self.sp['base_lr'] = '0.001'
        self.sp['momentum'] = '0.9'

        # speed:
        self.sp['test_iter'] = '100'
        self.sp['test_interval'] = '250'

        # looks:
        self.sp['display'] = '25'
        self.sp['snapshot'] = '2500'
        self.sp['snapshot_prefix'] = '"snapshot"'  # string within a string!

        # learning rate policy
        self.sp['lr_policy'] = '"fixed"'

        # important, but rare:
        self.sp['gamma'] = '0.1'
        self.sp['weight_decay'] = '0.0005'
        self.sp['train_net'] = '"' + trainnet_prototxt_path + '"'
        self.sp['test_net'] = '"' + testnet_prototxt_path + '"'

        # pretty much never change these.
        self.sp['max_iter'] = '100000'
        self.sp['test_initialization'] = 'false'
        self.sp['average_loss'] = '25'  # this has to do with the display.
        self.sp['iter_size'] = '1'  # this is for accumulating gradients

        if debug:
            self.sp['max_iter'] = '12'
            self.sp['test_iter'] = '1'
            self.sp['test_interval'] = '4'
            self.sp['display'] = '1'

    def add_from_file(self, filepath):
        """
        Reads a caffe solver prototxt file and updates the Caffesolver
        instance parameters.

        Blank lines and lines whose first non-blank character is '#' are
        skipped. Every other line must look like `key: value`; only the first
        ':' separates the two, so values may contain ':' themselves.

        Raises ValueError for a non-empty line without a ':'.
        """
        with open(filepath, 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue
                key, sep, value = stripped.partition(':')
                if not sep:
                    raise ValueError(
                        'Cannot parse solver line: {!r}'.format(stripped))
                self.sp[key.strip()] = value.strip()

    def write(self, filepath):
        """
        Export solver parameters to INPUT "filepath". Sorted alphabetically.

        Raises TypeError if any parameter value is not a string. The check
        runs before the file is opened, so no partial file is written.
        """
        entries = sorted(self.sp.items())
        for _, value in entries:
            if not isinstance(value, str):
                raise TypeError('All solver parameters must be strings')

        # the context manager flushes and closes the file
        with open(filepath, 'w') as f:
            for key, value in entries:
                f.write('%s: %s\n' % (key, value))
|