Source code for gravityspy.ml.train_semantic_index

import keras.backend as K
K.set_image_data_format("channels_first")
# contrastive_loss, siamese_acc, eucl_dist_output_shape, create_pairs3_gen and
# concatenate_views are used below and are assumed to also live in GS_utils
from GS_utils import (concatenate_views, contrastive_loss, cosine_distance,
                      create_pairs3_gen, eucl_dist_output_shape, siamese_acc)
from keras import regularizers
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.layers import Input, Dense, GlobalAveragePooling2D, Lambda
from keras.models import Model, load_model
from keras.optimizers import RMSprop

from gravityspy.utils import log
# read_image provides the read_rgb helper used below; its exact location in
# the package is assumed here
from gravityspy.ml import read_image

import numpy
import os
import pandas as pd

def pickle_trainingset(path_to_trainingset,
                       save_address='pickleddata/trainingset.pkl',
                       verbose=False):
    """Pre-process the training set images and save them to a pickle.

    Parameters:
        path_to_trainingset (str):
            Path to the training set, laid out as
            "somedirectoryname"/"classname"/"images"

        save_address (str, optional):
            Defaults to `pickleddata/trainingset.pkl`
            Path of the pickle file in which to save the pixelated
            training data

        verbose (bool, optional):
            Defaults to False
            Extra verbosity

    Returns:
        A pickled `pandas.DataFrame` with one row per sample and columns
        containing the pixelated 0.5, 1.0, 2.0 and 4.0 second duration
        images, as well as a column with the true label and a column
        with an ID that uniquely identifies that sample
    """
    logger = log.Logger('Gravity Spy: Pickling '
                        'Trainingset RGB')

    if not os.path.exists(os.path.dirname(save_address)):
        if verbose:
            logger.info('making... ' + os.path.dirname(save_address))
        os.makedirs(os.path.dirname(save_address))

    classes = sorted(os.listdir(path_to_trainingset))
    nb_classes = len(classes)
    logger.info('The number of classes are {0}'.format(nb_classes))
    logger.info('The classes you are pickling are {0}'.format(classes))

    data = pd.DataFrame()
    for iclass in classes:
        logger.info('Converting {0} into RGB info'.format(iclass))
        images = sorted(os.listdir(os.path.join(path_to_trainingset, iclass)))
        images = [imageidx for imageidx in images
                  if 'L1_' in imageidx or 'H1_' in imageidx or 'V1_' in imageidx]
        # Group each sample into sets of 4 different durations
        samples = zip(*(iter(images),) * 4)
        for isample in samples:
            tmpDF = pd.DataFrame()
            for idur in isample:
                if verbose:
                    logger.info('Converting {0}'.format(idur))
                image_data_r, image_data_g, image_data_b = read_image.read_rgb(
                    os.path.join(path_to_trainingset, iclass, idur),
                    resolution=0.3)
                information_on_image = idur.split('_')
                tmpDF[information_on_image[-1]] = [[image_data_r,
                                                    image_data_g,
                                                    image_data_b]]
            tmpDF['uniqueID'] = information_on_image[1]
            tmpDF['Label'] = iclass
            data = data.append(tmpDF)
        logger.info('Finished converting {0} into RGB info'.format(iclass))

    picklepath = os.path.join(save_address)
    logger.info('Saving pickled data to {0}'.format(picklepath))
    data.to_pickle(picklepath)
    return data
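A minimal sketch of how the pickling step might be invoked, assuming a training set laid out as ``TrainingSet/<classname>/<images>`` (the paths here are hypothetical):

from gravityspy.ml.train_semantic_index import pickle_trainingset

# hypothetical locations; point these at the real training set and output pickle
training_data = pickle_trainingset(path_to_trainingset='TrainingSet',
                                   save_address='pickleddata/trainingset.pkl',
                                   verbose=True)
print(training_data[['uniqueID', 'Label']].head())
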
def make_model(data, model_folder='model',
               unknown_classes_labels=['Whistle', 'Scratchy'],
               multi_view=False, batch_size=22, nb_epoch=10, nb_classes=22,
               fraction_validation=.125, fraction_testing=None,
               best_model_based_validset=0, image_size=[140, 170],
               random_seed=1986, verbose=True):
    """Train a Semantic Index.

    This module uses `keras <https://keras.io/>`_ to interface
    with the creation of a neural net and currently uses
    `theano <http://deeplearning.net/software/theano/>`_ as the backend
    for doing the heavy gpu lifting.

    The optimizer is :class:`keras.optimizers.RMSprop` and the loss
    being optimized is a contrastive loss on the cosine distance
    between pairs of embedded images.

    Parameters:
        data (`pandas.DataFrame`):
            Pickled training set data, as produced by
            :meth:`pickle_trainingset`

        model_folder (str, optional):
            Defaults to `model`
            Path to folder you would like to save the model

        unknown_classes_labels (list, optional):
            Defaults to ['Whistle', 'Scratchy']
            A list of classes to be considered as the unknown domain
            for which clustering will be performed once the knowledge
            from the known classes has been trained.

        multi_view (bool, optional):
            Defaults to False
            If True, all four durations of a sample are concatenated
            into a single image; otherwise only the 1.0 second
            duration is used.

        batch_size (int, optional): Default 22

        nb_epoch (int, optional): Default 10

        nb_classes (int, optional): Default 22

        fraction_validation (float, optional): Default .125

        fraction_testing (float, optional): Default None

        best_model_based_validset (int, optional): Default 0

        image_size (list, optional): Default [140, 170]

        random_seed (int, optional): Default 1986

        verbose (bool, optional): Default True
    """
    logger = log.Logger('Gravity Spy: Training '
                        'Semantic Index')

    logger.info('Using random seed {0}'.format(random_seed))
    numpy.random.seed(random_seed)  # for reproducibility

    logger.info('Your data set contained {0} samples'.format(len(data)))

    img_rows, img_cols = image_size[0], image_size[1]
    logger.info('The size of the images being trained {0}'.format(image_size))

    known_df = data.loc[~data.Label.isin(unknown_classes_labels)]
    logger.info('Given unknown images here is what is to be considered in '
                'the known '
                'domain of samples which are {0}'.format(known_df.Label.unique()))

    logger.info('Selecting images to be considered in the unknown '
                'domain of samples which are {0}'.format(unknown_classes_labels))
    unknown_df = data.loc[data.Label.isin(unknown_classes_labels)]

    known_x_1 = numpy.vstack(known_df['0.5.png'].values).reshape(
        -1, 1, img_rows, img_cols)
    unknown_x_1 = numpy.vstack(unknown_df['0.5.png'].values).reshape(
        -1, 1, img_rows, img_cols)

    known_x_2 = numpy.vstack(known_df['1.0.png'].values).reshape(
        -1, 1, img_rows, img_cols)
    unknown_x_2 = numpy.vstack(unknown_df['1.0.png'].values).reshape(
        -1, 1, img_rows, img_cols)

    known_x_3 = numpy.vstack(known_df['2.0.png'].values).reshape(
        -1, 1, img_rows, img_cols)
    unknown_x_3 = numpy.vstack(unknown_df['2.0.png'].values).reshape(
        -1, 1, img_rows, img_cols)

    known_x_4 = numpy.vstack(known_df['4.0.png'].values).reshape(
        -1, 1, img_rows, img_cols)
    unknown_x_4 = numpy.vstack(unknown_df['4.0.png'].values).reshape(
        -1, 1, img_rows, img_cols)

    # Build an unlabelled test sample from the first row of the data set,
    # stacking the red, green and blue channels of each duration
    test_set_unlabelled_x_1 = data.filter(regex=("1.0.png")).iloc[0].iloc[0]
    test_set_unlabelled_x_2 = data.filter(regex=("2.0.png")).iloc[0].iloc[0]
    test_set_unlabelled_x_3 = data.filter(regex=("4.0.png")).iloc[0].iloc[0]
    test_set_unlabelled_x_4 = data.filter(regex=("0.5.png")).iloc[0].iloc[0]

    test_set_unlabelled_x_1 = numpy.concatenate(
        (test_set_unlabelled_x_1[0].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_1[1].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_1[2].reshape(-1, 1, img_rows, img_cols)),
        axis=1)
    test_set_unlabelled_x_2 = numpy.concatenate(
        (test_set_unlabelled_x_2[0].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_2[1].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_2[2].reshape(-1, 1, img_rows, img_cols)),
        axis=1)
    test_set_unlabelled_x_3 = numpy.concatenate(
        (test_set_unlabelled_x_3[0].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_3[1].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_3[2].reshape(-1, 1, img_rows, img_cols)),
        axis=1)
    test_set_unlabelled_x_4 = numpy.concatenate(
        (test_set_unlabelled_x_4[0].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_4[1].reshape(-1, 1, img_rows, img_cols),
         test_set_unlabelled_x_4[2].reshape(-1, 1, img_rows, img_cols)),
        axis=1)

    if multi_view:
        known_classes = concatenate_views(known_x_1, known_x_2,
                                          known_x_3, known_x_4,
                                          [img_rows, img_cols], True)
        unknown_classes = concatenate_views(unknown_x_1, unknown_x_2,
                                            unknown_x_3, unknown_x_4,
                                            [img_rows, img_cols], True)
    else:
        # We are only using one duration for the similarity search
        known_classes = known_x_2
        unknown_classes = unknown_x_2

    # Per-sample integer class indices; these are assumed to be what
    # create_pairs3_gen expects for forming same-class/different-class pairs
    known_labels = sorted(known_df.Label.unique())
    known_classes_indices_for_metric_learning = numpy.array(
        [known_labels.index(ilabel) for ilabel in known_df.Label])
    unknown_classes_indices_for_clustering = numpy.array(
        [list(unknown_classes_labels).index(ilabel)
         for ilabel in unknown_df.Label])

    # Generate the Binary pairs for training.
    train_generator = create_pairs3_gen(known_classes,
                                        known_classes_indices_for_metric_learning,
                                        batch_size)
    valid_generator = create_pairs3_gen(unknown_classes,
                                        unknown_classes_indices_for_clustering,
                                        batch_size)

    # Create the model
    vgg16 = VGG16(weights='imagenet', include_top=False,
                  input_shape=known_classes.shape[1:])
    x = vgg16.output
    x = GlobalAveragePooling2D()(x)
    # let's add a fully-connected layer
    regularization = 1.e-4  # L2 penalty strength (assumed value)
    x = Dense(1024, kernel_regularizer=regularizers.l2(regularization))(x)
    predictions = Dense(200, activation='relu')(x)

    # Then create the corresponding model
    base_network = Model(inputs=vgg16.input, outputs=predictions)

    if multi_view:
        img_cols = 2*img_cols
        img_rows = 2*img_rows

    input_a = Input(shape=(3, img_rows, img_cols))
    input_b = Input(shape=(3, img_rows, img_cols))

    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    distance = Lambda(cosine_distance,
                      output_shape=eucl_dist_output_shape)(
        [processed_a, processed_b])

    similarity_model = Model(inputs=[input_a, input_b], outputs=distance)
    semantic_idx_model = Model(inputs=[input_a], outputs=processed_a)

    # Freeze the VGG16 convolutional layers (set > 0 to fine-tune the last few)
    number_of_cnn_from_vgg = 0
    for i in range(len(vgg16.layers) - number_of_cnn_from_vgg):
        vgg16.layers[i].trainable = False

    similarity_model.summary()
    semantic_idx_model.summary()

    rms = RMSprop()
    similarity_model.compile(loss=contrastive_loss, optimizer=rms,
                             metrics=[siamese_acc(0.3), siamese_acc(0.4),
                                      siamese_acc(0.5), siamese_acc(0.6),
                                      siamese_acc(0.7), siamese_acc(0.8),
                                      siamese_acc(0.9), siamese_acc(0.925),
                                      siamese_acc(0.95), siamese_acc(0.975),
                                      siamese_acc(0.985), siamese_acc(0.99)])

    # train
    logger.info('training the model ...')
    train_negative_factor = 1
    test_negative_factor = 1

    # the samples from the unknown set should be separated into test and
    # validation sets in the future
    train_batch_num = (len(known_classes) * (train_negative_factor + 1)) // batch_size
    logger.info('train batch num {0}'.format(train_batch_num))
    valid_batch_num = (len(unknown_classes) * (test_negative_factor + 1)) // batch_size

    similarity_model.fit_generator(train_generator,
                                   validation_data=valid_generator,
                                   verbose=2,
                                   steps_per_epoch=train_batch_num,
                                   validation_steps=valid_batch_num,
                                   epochs=nb_epoch)

    # validation
    logger.info('validating the model')
    logger.info('Known classes')
    res1 = similarity_model.evaluate_generator(train_generator, train_batch_num)
    logger.info(res1)
    logger.info('Unknown classes')
    res2 = similarity_model.evaluate_generator(valid_generator, valid_batch_num)
    logger.info(res2)

    # Recompile with a built-in loss so the saved models can be re-loaded
    # without the custom contrastive loss and accuracy metrics
    similarity_model.compile(loss='mean_squared_error', optimizer='rmsprop')

    if not os.path.isdir(model_folder):
        os.makedirs(model_folder)
    similarity_model.save(os.path.join(model_folder, 'similarity_metric_model.h5'))
    semantic_idx_model.save(os.path.join(model_folder, 'semantic_idx_model.h5'))
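For reference, a minimal sketch of what the pair-distance, contrastive-loss and thresholded-accuracy helpers imported from GS_utils might compute. These are illustrative stand-ins only (hence the underscore-prefixed names); the actual GS_utils implementations may differ in labelling convention and margin:

def _example_cosine_distance(vects):
    # distance between the two embedded views of a pair: 1 - cosine similarity
    x, y = vects
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return 1. - K.sum(x * y, axis=-1, keepdims=True)


def _example_eucl_dist_output_shape(shapes):
    # the Lambda distance layer emits one scalar per pair
    shape1, shape2 = shapes
    return (shape1[0], 1)


def _example_contrastive_loss(y_true, y_pred):
    # contrastive loss: pairs labelled 1 (same class) are pulled together,
    # pairs labelled 0 are pushed beyond the margin
    margin = 1.
    return K.mean(y_true * K.square(y_pred) +
                  (1. - y_true) * K.square(K.maximum(margin - y_pred, 0.)))


def _example_siamese_acc(threshold):
    # fraction of pairs classified correctly when a distance below
    # `threshold` is taken to mean "same class"
    def acc(y_true, y_pred):
        return K.mean(K.equal(y_true, K.cast(y_pred < threshold, y_true.dtype)))
    return acc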
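Finally, a sketch of end-to-end usage, reading the pickle produced by pickle_trainingset and training the semantic index (paths and argument values are illustrative):

import pandas as pd
from gravityspy.ml.train_semantic_index import make_model

# hypothetical pickle produced by pickle_trainingset above
training_data = pd.read_pickle('pickleddata/trainingset.pkl')

make_model(training_data,
           model_folder='model',
           unknown_classes_labels=['Whistle', 'Scratchy'],
           multi_view=True,
           batch_size=22,
           nb_epoch=10)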