diff --git a/docs/parameters.rst b/docs/parameters.rst index 77d0879..ea987cf 100644 --- a/docs/parameters.rst +++ b/docs/parameters.rst @@ -49,5 +49,16 @@ Here is the full list of configuration parameters you can specify in a ``config. ``'segmentation'`` Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially according to ``config.json`` so latter classes will be written over earlier class labels if there is overlap. +**seed**: int + Random generator seed. Optional, use to make results reproducible. + +**split_vals**: list + Default: `[0.8, 0.2]` + Fraction of data to put in each category listed in `split_names`. Must be a list of floats that sum to one and match the length of `split_names`. For train, validate, and test data, a list like `[0.7, 0.2, 0.1]` is suggested. + +**split_names**: list + Default: `['train', 'test']` + List of names for each subset of the data. Length of list must match length of `split_vals`. + **imagery_offset**: list of ints An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds. diff --git a/label_maker/package.py b/label_maker/package.py index 4d717c9..480b4a8 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -9,7 +9,8 @@ from label_maker.utils import is_tif -def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs): +def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'], + split_vals=[0.8, .2], **kwargs): """Generate an .npz file containing arrays for training machine learning algorithms Parameters @@ -28,9 +29,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_ ml_type: str Defines the type of machine learning. 
One of "classification", "object-detection", or "segmentation" seed: int - Random generator seed. Optional, use to make results reproducable. - train_size: float - Portion of the data to use in training, the remainder is used as test data (default 0.8) + Random generator seed. Optional, use to make results reproducible. + split_vals: list + Default: [0.8, 0.2] + Fraction of data to put in each category listed in split_names. + Must be floats and must sum to one. + split_names: list + Default: ['train', 'test'] + List of names for each subset of the data. **kwargs: dict Other properties from CLI config passed as keywords to other utility functions """ @@ -38,6 +44,11 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_ if seed: np.random.seed(seed) + if len(split_names) != len(split_vals): + raise ValueError('`split_names` and `split_vals` must be the same length. Please update your config.') + if not np.isclose(sum(split_vals), 1): + raise ValueError('`split_vals` must sum to one. 
Please update your config.') + # open labels file, create tile array labels_file = op.join(dest_folder, 'labels.npz') labels = np.load(labels_file) @@ -60,7 +71,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_ # open the images and load those plus the labels into the final arrays o = urlparse(imagery) _, image_format = op.splitext(o.path) - if is_tif(imagery): # if a TIF is provided, use jpg as tile format + if is_tif(imagery): # if a TIF is provided, use jpg as tile format image_format = '.jpg' for tile in tiles: image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format)) @@ -86,16 +97,28 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_ elif ml_type == 'segmentation': y_vals.append(labels[tile][..., np.newaxis]) # Add grayscale channel - # split into train and test - split_index = int(len(x_vals) * train_size) - - # convert lists to numpy arrays + # Convert lists to numpy arrays x_vals = np.array(x_vals, dtype=np.uint8) y_vals = np.array(y_vals, dtype=np.uint8) - print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz'))) - np.savez(op.join(dest_folder, 'data.npz'), - x_train=x_vals[:split_index, ...], - y_train=y_vals[:split_index, ...], - x_test=x_vals[split_index:, ...], - y_test=y_vals[split_index:, ...]) + # Get number of data samples per split from the float proportions + split_n_samps = [len(x_vals) * val for val in split_vals] + + if np.any(split_n_samps == 0): + raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.') + + # Convert into a cumulative sum to get indices + split_inds = np.cumsum(split_n_samps).astype(np.integer) + + # Exclude last index as `np.split` handles splitting without that value + split_arrs_x = np.split(x_vals, split_inds[:-1]) + split_arrs_y = np.split(y_vals, split_inds[:-1]) + + save_dict = {} + + for si, split_name in enumerate(split_names): + 
save_dict[f'x_{split_name}'] = split_arrs_x[si] + save_dict[f'y_{split_name}'] = split_arrs_y[si] + + np.savez(op.join(dest_folder, 'data.npz'), **save_dict) + print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz'))) \ No newline at end of file diff --git a/label_maker/validate.py b/label_maker/validate.py index 2a60425..13c34fb 100644 --- a/label_maker/validate.py +++ b/label_maker/validate.py @@ -30,5 +30,7 @@ 'background_ratio': {'type': 'float'}, 'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True}, 'seed': {'type': 'integer'}, - 'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2} + 'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}, + 'split_vals': {'type': 'list', 'schema': {'type': 'float'}}, + 'split_names': {'type': 'list', 'schema': {'type': 'string'}} } diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json new file mode 100644 index 0000000..e6a144c --- /dev/null +++ b/test/fixtures/integration/config_3way.integration.json @@ -0,0 +1,23 @@ +{"country": "portugal", + "bounding_box": [ + -9.4575, + 38.8467, + -9.4510, + 38.8513 + ], + "zoom": 17, + "classes": [ + { "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] }, + { "name": "Building", "filter": ["has", "building"] }, + { "name": "Farmland", "filter": ["==", "landuse", "farmland"] }, + { "name": "Ruins", "filter": ["==", "historic", "ruins"] }, + { "name": "Parking", "filter": ["==", "amenity", "parking"] }, + { "name": "Roads", "filter": ["has", "highway"] } + ], + "imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN", + "background_ratio": 1, + "ml_type": "classification", + "seed": 19, + "split_names": ["train", "test", "val"], + "split_vals": [0.7, 0.2, 0.1] +} diff --git 
a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py index db5dc3b..5da901b 100644 --- a/test/integration/test_classification_package.py +++ b/test/integration/test_classification_package.py @@ -7,17 +7,24 @@ import numpy as np + class TestClassificationPackage(unittest.TestCase): """Tests for classification package creation""" + @classmethod def setUpClass(cls): makedirs('integration-cl') copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz') copytree('test/fixtures/integration/tiles', 'integration-cl/tiles') + makedirs('integration-cl-split') + copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz') + copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles') + @classmethod def tearDownClass(cls): rmtree('integration-cl') + rmtree('integration-cl-split') def test_cli(self): """Verify data.npz produced by CLI""" @@ -48,3 +55,22 @@ def test_cli(self): [0, 0, 0, 0, 0, 0, 1]] ) self.assertTrue(np.array_equal(data['y_test'], expected_y_test)) + + def test_cli_3way_split(self): + """Verify data.npz produced by CLI when split into train/test/val""" + + cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json' + cmd = cmd.split(' ') + subprocess.run(cmd, universal_newlines=True) + + data = np.load('integration-cl-split/data.npz') + + # validate our image data with shapes + self.assertEqual(data['x_train'].shape, (5, 256, 256, 3)) + self.assertEqual(data['x_test'].shape, (2, 256, 256, 3)) + self.assertEqual(data['x_val'].shape, (1, 256, 256, 3)) + + # validate label data with shapes + self.assertEqual(data['y_train'].shape, (5, 7)) + self.assertEqual(data['y_test'].shape, (2, 7)) + self.assertEqual(data['y_val'].shape, (1, 7)) \ No newline at end of file