# Source code for radio.preprocessing.ct_masked_batch

# pylint: disable=no-member
# pylint: disable=too-many-public-methods
# pylint: disable=too-many-locals
# pylint: disable=too-many-arguments

""" Batch class CTImagesMaskedBatch for storing CT-scans with masks. """

import logging

import numpy as np
import pandas as pd
from numba import njit
from skimage import measure

try:
    from tqdm import tqdm_notebook
except ImportError:
    tqdm_notebook = lambda x: x

from .ct_batch import CTImagesBatch
from .mask import make_mask_numba, create_mask_reg
from .histo import sample_histo3d
from .crop import make_central_crop
from ..dataset import action, DatasetIndex, SkipBatchException  # pylint: disable=no-name-in-module


# logger initialization
logger = logging.getLogger(__name__) # pylint: disable=invalid-name


@njit(nogil=True)
def get_nodules_numba(data, positions, size):
    """ Fetch nodules from array by starting positions.

    Takes array with data of shape (z, y, x) from batch, ndarray(p, 3) with
    starting indices of nodules, where p is the number of nodules, and
    ndarray(3, ) with nodules' sizes along each axis. The output is a 3d
    ndarray with the nodules stacked in a CTImagesBatch-compatible
    `skyscraper` structure.

    Parameters
    ----------
    data : ndarray
        CTImagesBatch `skyscraper` represented by 3D ndarray.
    positions : ndarray(l, 3) of int
        contains nodules' starting indices along [zyx]-axis
        accordingly in `data`.
    size : ndarray(3,) of int
        contains nodules' sizes along each axis (z, y, x).

    Notes
    -----
    Dtypes of positions and size arrays must be the same.

    Returns
    -------
    ndarray
        3d ndarray with nodules.
    """
    n_positions = positions.shape[0]
    # NOTE: np.int was removed from recent numpy releases; shape[0] is
    # already a plain int, so no cast is needed here
    out_arr = np.zeros((n_positions, size[0], size[1], size[2]))
    for i in range(n_positions):
        out_arr[i, :, :, :] = data[positions[i, 0]: positions[i, 0] + size[0],
                                   positions[i, 1]: positions[i, 1] + size[1],
                                   positions[i, 2]: positions[i, 2] + size[2]]
    return out_arr.reshape(n_positions * size[0], size[1], size[2])
class CTImagesMaskedBatch(CTImagesBatch):
    """ Batch class for storing batch of ct-scans with masks for nodules.

    Allows to load info about cancer nodules, then create cancer-masks
    for each patient. Created masks are stored in self.masks.

    Parameters
    ----------
    index : dataset.index
        ids of scans to be put in a batch.

    Attributes
    ----------
    components : tuple of strings.
        List names of data components of a batch, which are `images`,
        `masks`, `origin` and `spacing`.
        NOTE: Implementation of this attribute is required by Base class.
    num_nodules : int
        number of nodules in batch.
    images : ndarray
        contains ct-scans for all patients in batch.
    masks : ndarray
        contains masks for all patients in batch.
    nodules : np.recarray
        contains info on cancer nodules location. record array contains
        the following information about nodules:
        - self.nodules.nodule_center -- ndarray(num_nodules, 3) centers of
          nodules in world coords;
        - self.nodules.nodule_size -- ndarray(num_nodules, 3) sizes of
          nodules along z, y, x in world coord;
        - self.nodules.img_size -- ndarray(num_nodules, 3) sizes of images
          of patient data corresponding to nodules;
        - self.nodules.offset -- ndarray(num_nodules, 3) position of
          individual patient scan inside batch;
        - self.nodules.spacing -- ndarray(num_nodules, 3) of spacing
          attribute of patients which correspond to nodules;
        - self.nodules.origin -- ndarray(num_nodules, 3) of origin
          attribute of patients which correspond to nodules.
    """

    # record-dtype of self.nodules; builtin int/float replace the np.int /
    # np.float aliases that were removed from numpy, and the deprecated
    # itemsize-1 form ('patient_pos', np.int, 1) is written as a plain
    # scalar field -- both changes keep the resulting dtype identical
    nodules_dtype = np.dtype([('patient_pos', int),
                              ('offset', int, (3,)),
                              ('img_size', int, (3,)),
                              ('nodule_center', float, (3,)),
                              ('nodule_size', float, (3,)),
                              ('spacing', float, (3,)),
                              ('origin', float, (3,))])

    components = "images", "masks", "spacing", "origin"
[docs] @staticmethod def make_indices(size): """ Generate list of batch indices of given `size`. Parameters ---------- size : int size of list with indices Returns ------- list list of random indices Examples -------- >>> indices = CTImagesMaskedBatch.make_indices(20) >>> indices array(['3c3eb09b', '5b192d1f', 'f28ddbb0', '14460196', '31a92510', '3f324e44', '066ccf28', '5570938d', '5d1fb8f6', '539ea09c', '68f9f235', '8f7b0c49', 'c7903591', 'dc8e9504', '54e9eebc', '778abd5a', '99691fc6', '7da49e85', '0f343345', '876fb9e6'], dtype='<U8') """ return np.array([CTImagesMaskedBatch.make_filename() for i in range(size)])
def __init__(self, index, *args, **kwargs): """ Execute Batch construction and init of basic attributes Parameters ---------- index : Dataset.Index class. Required indexing of objects (files). """ super().__init__(index, *args, **kwargs) self.masks = None self.nodules = None
[docs] def nodules_to_df(self, nodules): """ Convert nodules_info ndarray into pandas dataframe. Pandas DataFrame will contain following columns: 'source_id' - id of source element of batch; 'nodule_id' - generated id for nodules; 'locZ', 'locY', 'locX' - coordinates of nodules' centers; 'diamZ', 'diamY', 'diamX' - sizes of nodules along zyx axes; Parameters ---------- nodules : ndarray of type nodules_info nodules_info type is defined inside of CTImagesMaskedBatch class. Returns ------- pd.DataFrame centers, ids and sizes of nodules. """ columns = ['nodule_id', 'source_id', 'locZ', 'locY', 'locX', 'diamZ', 'diamY', 'diamX'] nodule_id = self.make_indices(nodules.shape[0]) return pd.DataFrame({'source_id': self.indices[nodules.patient_pos], 'nodule_id': nodule_id, 'locZ': nodules.nodule_center[:, 0], 'locY': nodules.nodule_center[:, 1], 'locX': nodules.nodule_center[:, 2], 'diamZ': nodules.nodule_size[:, 0], 'diamY': nodules.nodule_size[:, 1], 'diamX': nodules.nodule_size[:, 2]}, columns=columns)
[docs] def get_pos(self, data, component, index): """ Return a positon of an item for a given index in data or in self.`component`. Fetch correct position inside batch for an item, looks for it in `data`, if provided, or in `component` in self. Parameters ---------- data : None or ndarray data from which subsetting is done. If None, retrieve position from `component` of batch, if ndarray, returns index. component : str name of a component, f.ex. 'images'. if component provided, data should be None. index : str or int index of an item to be looked for. may be key from dataset (str) or index inside batch (int). Returns ------- int Position of item Notes ----- This is an overload of get_pos from base Batch-class, see corresponding docstring for detailed explanation. """ if data is None: ind_pos = self._get_verified_pos(index) if component in ['images', 'masks']: return slice(self.lower_bounds[ind_pos], self.upper_bounds[ind_pos]) else: return slice(ind_pos, ind_pos + 1) else: return index
@property def num_nodules(self): """ Get number of nodules in CTImagesMaskedBatch. Returns ------- int number of nodules in CTImagesMaskedBatch. if fetch_nodules_info method has not been called yet returns 0. """ if self.nodules is not None: return self.nodules.patient_pos.shape[0] else: return 0
[docs] @action def fetch_nodules_info(self, nodules=None, nodules_records=None, update=False, images_loaded=True): """Extract nodules' info from nodules into attribute self.nodules. Parameters ---------- nodules : pd.DataFrame contains: - 'seriesuid': index of patient or series. - 'coordZ','coordY','coordX': coordinates of nodules center. - 'diameter_mm': diameter, in mm. nodules_records : np.recarray if not None, should contain the same fields as describe in Note. update : bool if False, warning appears to remind that nodules info will be earased and recomputed. images_loaded : bool if True, i.e. `images` component is loaded, and image_size is used to compute correct nodules location inside `skyscraper`. If False, it doesn't update info of location inside `skyscraper`. Returns ------- batch Notes ----- Run this action only after :func:`~radio.CTImagesBatch.load`. The method fills in record array self.nodules that contains the following information about nodules: - self.nodules.nodule_center -- ndarray(num_nodules, 3) centers of nodules in world coords; - self.nodules.nodule_size -- ndarray(num_nodules, 3) sizes of nodules along z, y, x in world coord; - self.nodules.img_size -- ndarray(num_nodules, 3) sizes of images of patient data corresponding to nodules; - self.nodules.offset -- ndarray(num_nodules, 3) of biases of patients which correspond to nodules; - self.nodules.spacing -- ndarray(num_nodules, 3) of spacinf attribute of patients which correspond to nodules; - self.nodules.origin -- ndarray(num_nodules, 3) of origin attribute of patients which correspond to nodules. - self.nodules.patient_pos -- ndarray(num_nodules, 1) refers to positions of patients which correspond to stored nodules. """ if self.nodules is not None and not update: logger.warning("Nodules have already been extracted. 
" + "Put update argument as True for refreshing") return self if nodules_records is not None: # load from record-array self.nodules = nodules_records else: # assume that nodules is supplied and load from it required_columns = np.array(['seriesuid', 'diameter_mm', 'coordZ', 'coordY', 'coordX']) if not (isinstance(nodules, pd.DataFrame) and np.all(np.in1d(required_columns, nodules.columns))): raise ValueError(("Argument 'nodules' must be pandas DataFrame" + " with {} columns. Make sure that data provided" + " in correct format.").format(required_columns.tolist())) nodules_df = nodules.set_index('seriesuid') unique_indices = nodules_df.index.unique() inter_index = np.intersect1d(unique_indices, self.indices) nodules_df = nodules_df.loc[inter_index, ["coordZ", "coordY", "coordX", "diameter_mm"]] num_nodules = nodules_df.shape[0] self.nodules = np.rec.array(np.zeros(num_nodules, dtype=self.nodules_dtype)) counter = 0 for pat_id, coordz, coordy, coordx, diam in nodules_df.itertuples(): pat_pos = self.index.get_pos(pat_id) self.nodules.patient_pos[counter] = pat_pos self.nodules.nodule_center[counter, :] = np.array([coordz, coordy, coordx]) self.nodules.nodule_size[counter, :] = np.array([diam, diam, diam]) counter += 1 self._refresh_nodules_info(images_loaded) return self
[docs] @action def fetch_nodules_from_mask(self, images_loaded=True): """ Fetch nodules info (centers and sizes) from masks. Runs skimage.measure.labels for fetching nodules regions from masks. Extracts nodules info from segmented regions and put this information in self.nodules np.recarray. Parameters ---------- images_loaded : bool if True, i.e. `images` component is loaded, and image_size is used to compute correct nodules location inside `skyscraper`. If False, it doesn't update info of location inside `skyscraper`. Returns ------- batch Notes ----- Sizes along [zyx] will be the same. """ nodules_list = [] for pos in range(len(self)): mask = self.get(pos, 'masks') mask_labels = measure.label(mask, background=0) for props in measure.regionprops(np.int16(mask_labels)): center = np.asarray((props.centroid[0], props.centroid[1], props.centroid[2]), dtype=np.float) center = center * self.spacing[pos] + self.origin[pos] diameter = np.asarray( [props.equivalent_diameter] * 3, dtype=np.float) diameter = diameter * self.spacing[pos] nodules_list.append({'patient_pos': pos, 'nodule_center': center, 'nodule_size': diameter}) num_nodules = len(nodules_list) self.nodules = np.rec.array( np.zeros(num_nodules, dtype=self.nodules_dtype)) for i, nodule in enumerate(nodules_list): self.nodules.patient_pos[i] = nodule['patient_pos'] self.nodules.nodule_center[i, :] = nodule['nodule_center'] self.nodules.nodule_size[i, :] = nodule['nodule_size'] self._refresh_nodules_info(images_loaded) return self
# TODO: another name of method def _fit_into_bounds(self, size, variance=None): """ Fetch start voxel coordinates of all nodules. Get start voxel coordinates of all nodules in batch. Note that all nodules are considered to have fixed same size defined by argument size: if nodule is out of patient's 3d image bounds than it's center is shifted to border. Parameters ---------- size : list or tuple of ndarrays ndarray(3, ) with diameters of nodules in (z,y,x). variance : ndarray(3, ) diagonal elements of multivariate normal distribution, for sampling random shifts along (z,y,x) correspondingly. Returns ------- ndarray start coordinates (z,y,x) of all nodules in batch. """ size = np.array(size, dtype=np.int) center_pix = np.abs(self.nodules.nodule_center - self.nodules.origin) / self.nodules.spacing start_pix = (np.rint(center_pix) - np.rint(size / 2)) if variance is not None: start_pix += np.random.multivariate_normal(np.zeros(3), np.diag(variance), self.nodules.patient_pos.shape[0]) end_pix = start_pix + size bias_upper = np.maximum(end_pix - self.nodules.img_size, 0) start_pix -= bias_upper end_pix -= bias_upper bias_lower = np.maximum(-start_pix, 0) start_pix += bias_lower end_pix += bias_lower return (start_pix + self.nodules.offset).astype(np.int)
[docs] @action def create_mask(self): """ Create `masks` component from `nodules` component. Notes ----- `nodules` must be not None before calling this method. see :func:`~radio.preprocessing.ct_masked_batch.CTImagesMaskedBatch.fetch_nodules_info` for more details. """ if self.nodules is None: logger.warning("Info about nodules location must " + "be loaded before calling this method. " + "Nothing happened.") self.masks = np.zeros_like(self.images) center_pix = np.abs(self.nodules.nodule_center - self.nodules.origin) / self.nodules.spacing start_pix = (center_pix - np.rint(self.nodules.nodule_size / self.nodules.spacing / 2)) start_pix = np.rint(start_pix).astype(np.int) make_mask_numba(self.masks, self.nodules.offset, self.nodules.img_size + self.nodules.offset, start_pix, np.rint(self.nodules.nodule_size / self.nodules.spacing)) return self
[docs] def fetch_mask(self, shape): """ Create `masks` component of different size then `images`, using `nodules` component. Parameters ---------- shape : tuple, list or ndarray of int. (z_dim,y_dim,x_dim), shape of mask to be created. Returns ------- ndarray 3d array with masks in form of `skyscraper`. # TODO: one part of code from here repeats create_mask function better to unify these two func """ if self.nodules is None: logger.warning("Info about nodules location must " + "be loaded before calling this method. " + "Nothing happened.") mask = np.zeros(shape=(len(self) * shape[0], *shape[1:])) # infer scale factor; assume patients are already resized to equal # shapes scale_factor = np.asarray(shape) / self.images_shape[0, :] # get rescaled nodule-centers, nodule-sizes, offsets, locs of nod # starts center_scaled = (np.abs(self.nodules.nodule_center - self.nodules.origin) / self.nodules.spacing * scale_factor) start_scaled = (center_scaled - scale_factor * self.nodules.nodule_size / self.nodules.spacing / 2) start_scaled = np.rint(start_scaled).astype(np.int) offset_scaled = np.rint(self.nodules.offset * scale_factor).astype(np.int) img_size_scaled = np.rint( self.nodules.img_size * scale_factor).astype(np.int) nod_size_scaled = (np.rint(scale_factor * self.nodules.nodule_size / self.nodules.spacing)).astype(np.int) # put nodules into mask make_mask_numba(mask, offset_scaled, img_size_scaled + offset_scaled, start_scaled, nod_size_scaled) # return ndarray-mask return mask
# TODO rename function to sample_random_nodules_positions
[docs] def sample_random_nodules(self, num_nodules, nodule_size, histo=None): """ Sample random nodules positions in CTImagesBatchMasked. Samples random nodules positions in ndarray. Each nodule have shape defined by `nodule_size`. If size of patients' data along z-axis is not the same for different patients, NotImplementedError will be raised. Parameters ---------- num_nodules : int number of nodules to sample from dataset. nodule_size : ndarray(3, ) crop shape along (z,y,x). histo : tuple np.histogram()'s output. 3d-histogram, represented by tuple (bins, edges). Returns ------- ndarray ndarray(num_nodules, 3). 1st array's dim is an index of sampled nodules, 2nd points out start positions (integers) of nodules in batch `skyscraper`. """ all_indices = np.arange(len(self)) sampled_indices = np.random.choice( all_indices, num_nodules, replace=True) offset = np.zeros((num_nodules, 3)) offset[:, 0] = self.lower_bounds[sampled_indices] data_shape = self.images_shape[sampled_indices, :] # if supplied, use histogram as the sampler if histo is None: sampler = lambda size: np.random.rand(size, 3) else: sampler = lambda size: sample_histo3d(histo, size) samples = sampler(size=num_nodules) * (data_shape - nodule_size) if histo is not None: samples /= data_shape return np.asarray(samples + offset, dtype=np.int), sampled_indices
[docs] @action def sample_nodules(self, batch_size, nodule_size=(32, 64, 64), share=0.8, variance=None, # pylint: disable=too-many-locals, too-many-statements mask_shape=None, histo=None): """ Sample random crops of `images` and `masks` from batch. Create random crops, both with and without nodules in it, from input batch. Parameters ---------- batch_size : int number of nodules in the output batch. Required, if share=0.0. If None, resulting batch will include all cancerous nodules. nodule_size : tuple, list or ndarray of int crop shape along (z,y,x). share : float share of cancer crops in the batch. if input CTImagesBatch contains less cancer nodules than needed random nodules will be taken. variance : tuple, list or ndarray of float variances of normally distributed random shifts of nodules' start positions. mask_shape : tuple, list or ndarray of int size of `masks` crop in (z,y,x)-order. If not None, crops with masks would be of mask_shape. If None, mask crop shape would be equal to crop_size. histo : tuple np.histogram()'s output. Used for sampling non-cancerous crops. Returns ------- Batch batch with cancerous and non-cancerous crops in a proportion defined by `share` with total `batch_size` nodules. If `share` == 1.0, `batch_size` is None, resulting batch consists of all cancerous crops stored in batch. """ # make sure that nodules' info is fetched and args are OK if self.nodules is None: raise AttributeError("Info about nodules location must " + "be loaded before calling this method") if variance is not None: variance = np.asarray(variance, dtype=np.int) variance = variance.flatten() if len(variance) != 3: logger.warning('Argument variance be np.array-like' + 'and has shape (3,). 
' + 'Would be used no-scale-shift.') variance = None if share == 0.0 and batch_size is None: raise ValueError('Either supply batch_size or set share to positive number') # pos of batch-items that correspond to crops crops_indices = np.zeros(0, dtype=np.int16) # infer the number of cancerous nodules and the size of batch batch_size = batch_size if batch_size is not None else 1.0 / share * self.num_nodules cancer_n = int(share * batch_size) batch_size = int(batch_size) cancer_n = self.num_nodules if cancer_n > self.num_nodules else cancer_n if batch_size == 0: raise SkipBatchException('Batch of zero size cannot be passed further through the workflow') # choose cancerous nodules' starting positions nodule_size = np.asarray(nodule_size, dtype=np.int) if self.num_nodules == 0: cancer_nodules = np.zeros((0, 3)) else: # adjust cancer nodules' starting positions s.t. nodules fit into # scan-boxes cancer_nodules = self._fit_into_bounds( nodule_size, variance=variance) # randomly select needed number of cancer nodules (their starting # positions) sample_indices = np.random.choice(np.arange(self.num_nodules), size=cancer_n, replace=False) cancer_nodules = cancer_nodules[sample_indices, :] # store scans-indices for chosen crops cancerous_indices = self.nodules.patient_pos[sample_indices].reshape(-1) crops_indices = np.concatenate([crops_indices, cancerous_indices]) nodules_st_pos = cancer_nodules # if non-cancerous nodules are needed, add random starting pos if batch_size - cancer_n > 0: # sample starting positions for (most-likely) non-cancerous crops random_nodules, random_indices = self.sample_random_nodules(batch_size - cancer_n, nodule_size, histo=histo) # concat non-cancerous and cancerous crops' starting positions nodules_st_pos = np.vstack([nodules_st_pos, random_nodules]).astype( np.int) # pylint: disable=no-member # store scan-indices for randomly chose crops crops_indices = np.concatenate([crops_indices, random_indices]) # obtain nodules' scans by cropping from 
self.images images = get_nodules_numba(self.images, nodules_st_pos, nodule_size) # if mask_shape not None, compute scaled mask for the whole batch # scale also nodules' starting positions and nodules' shapes if mask_shape is not None: scale_factor = np.asarray(mask_shape) / np.asarray(nodule_size) batch_mask_shape = np.rint( scale_factor * self.images_shape[0, :]).astype(np.int) batch_mask = self.fetch_mask(batch_mask_shape) nodules_st_pos = np.rint( scale_factor * nodules_st_pos).astype(np.int) else: batch_mask = self.masks mask_shape = nodule_size # crop nodules' masks masks = get_nodules_numba(batch_mask, nodules_st_pos, mask_shape) # build nodules' batch bounds = np.arange(batch_size + 1) * nodule_size[0] crops_spacing = self.spacing[crops_indices] offset = np.zeros((batch_size, 3)) offset[:, 0] = self.lower_bounds[crops_indices] crops_origin = self.origin[crops_indices] + crops_spacing * (nodules_st_pos - offset) names_gen = zip(self.indices[crops_indices], self.make_indices(batch_size)) ix_batch = ['_'.join([prefix, random_str]) for prefix, random_str in names_gen] nodules_batch = type(self)(DatasetIndex(ix_batch)) nodules_batch._init_data(images=images, bounds=bounds, spacing=crops_spacing, origin=crops_origin, masks=masks) # pylint: disable=protected-access # set nodules info in nodules' batch nodules_records = [self.nodules[self.nodules.patient_pos == crop_pos] for crop_pos in crops_indices] new_patient_pos = [] for i, records in enumerate(nodules_records): new_patient_pos += [i] * len(records) new_patient_pos = np.array(new_patient_pos) nodules_records = np.concatenate(nodules_records) nodules_records = nodules_records.view(np.recarray) nodules_records.patient_pos = new_patient_pos nodules_batch.fetch_nodules_info(nodules_records=nodules_records) # leave out nodules with zero-intersection with crops' boxes nodules_batch._filter_nodules_info() # pylint: disable=protected-access return nodules_batch
[docs] @action def sample_dump(self, dst, n_iters, nodule_size=(32, 64, 64), batch_size=20, share=0.8, **kwargs): """ Perform sample_nodules and dump on the same batch n_iters times. Can be used for fast creation of large datasets of cancerous/non-cancerous crops. Parameters ---------- dst : str folder to dump nodules in. n_iters : int number of iterations to be performed. nodule_size : tuple, list or ndarray of int (z,y,x)-shape of sampled nodules. batch_size : int or None size of generated batches. share : float share of cancer nodules. See docstring of sample_nodules for more info about possible combinations of parameters share and batch_size. **kwargs : dict additional arguments supplied into sample_nodules. See docstring of sample_nodules for more info. """ for _ in range(n_iters): nodules = self.sample_nodules(batch_size=batch_size, nodule_size=nodule_size, share=share, **kwargs) nodules = nodules.dump(dst=dst) return self
[docs] @action def update_nodules_histo(self, histo): """ Update histogram of nodules' locations using nodules locations from batch. Parameters ---------- histo : list list(np.histogram()), used for sampling cancerous locations. Notes ----- Execute action only after .fetch_nodules_info(). """ # infer bins' bounds from histo bins = histo[1] # get cancer_nodules' centers in voxel coords center_pix = np.abs(self.nodules.nodule_center - self.nodules.origin) / self.nodules.spacing # update bins of histo histo_delta = np.histogramdd(center_pix, bins=bins) histo[0] += histo_delta[0] return self
[docs] def get_axial_slice(self, patient_pos, height): """ Get tuple of `images` slice and `masks` slice by patient and slice position. Parameters ---------- patient_pos : int patient position in the batch height : float number of slice (z-axis), scaled to [0:1] used to get slice with position: int(height * number_of slices_for_patient) from patient's scan and mask. Returns ------- tuple (images_slice,masks_slice) by patient_pos and number of slice """ margin = int(height * self.get(patient_pos, 'images').shape[0]) if self.masks is not None: patch = (self.get(patient_pos, 'images')[margin, :, :], self.get(patient_pos, 'masks')[margin, :, :]) else: patch = (self.get(patient_pos, 'images')[margin, :, :], None) return patch
def _refresh_nodules_info(self, images_loaded=True):
    """ Refresh self.nodules attributes [spacing, origin, img_size, bias].

    This method is called to update [spacing, origin, img_size, bias]
    attributes of self.nodules because batch's inner data has changed,
    e.g. after resize.

    Parameters
    ----------
    images_loaded : bool
        if True, assumes that `_bounds` attribute is computed,
        i.e. either `masks` and/or `images` are loaded.
    """
    if images_loaded:
        # offset/img_size depend on the skyscraper layout (_bounds)
        self.nodules.offset[:, 0] = self.lower_bounds[
            self.nodules.patient_pos]
        self.nodules.img_size = self.images_shape[
            self.nodules.patient_pos, :]

    self.nodules.spacing = self.spacing[self.nodules.patient_pos, :]
    self.nodules.origin = self.origin[self.nodules.patient_pos, :]

def _filter_nodules_info(self):
    """ Filter record-array self.nodules s.t. only records about cancerous
    nodules that have non-zero intersection with scan-boxes be present.

    Notes
    -----
    can be called only after execution of fetch_nodules_info
    and _refresh_nodules_info
    """
    # nodules start and trailing pixel-coords
    center_pix = (self.nodules.nodule_center
                  - self.nodules.origin) / self.nodules.spacing
    start_pix = center_pix - np.rint(self.nodules.nodule_size
                                     / self.nodules.spacing / 2)
    # fix: np.int was removed in NumPy 1.24; builtin int is the same alias
    start_pix = np.rint(start_pix).astype(int)
    end_pix = start_pix + np.rint(self.nodules.nodule_size
                                  / self.nodules.spacing)

    # find nodules with no intersection with scan-boxes
    nods_images_shape = self.images_shape[self.nodules.patient_pos]
    start_mask = np.any(start_pix >= nods_images_shape, axis=1)
    end_mask = np.any(end_pix <= 0, axis=1)
    zero_mask = start_mask | end_mask

    # filter out such nodules
    self.nodules = self.nodules[~zero_mask]

def _rescale_spacing(self):
    """ Rescale spacing values and call _refresh_nodules_info().

    Method is called after any operation that changes shape of inner data.
    """
    if self.nodules is not None:
        self._refresh_nodules_info()
    return self

def _post_mask(self, list_of_arrs, **kwargs):
    """ Concatenate outputs of different workers and put the result in `masks`.

    Parameters
    ----------
    list_of_arrs : list
        list of ndarrays of patients' masks.
    """
    self._reraise_worker_exceptions(list_of_arrs)
    new_masks = np.concatenate(list_of_arrs, axis=0)
    self.masks = new_masks
    return self

def _init_load_blosc(self, **kwargs):
    """ Init-func for load from blosc.

    Fills images/masks-components with zeroes if the components
    are to be updated.

    Parameters
    ----------
    **kwargs
        components : str, list or tuple
            iterable of components names that need to be loaded.

    Returns
    -------
    list
        list of ids of batch-items, i.e. series ids or patient ids.
    """
    # fill 'images', 'masks'-comps with zeroes if needed
    skysc_components = {'images', 'masks'} & set(kwargs['components'])
    self._prealloc_skyscraper_components(skysc_components)
    return self.indices

def _post_rebuild(self, all_outputs, new_batch=False, **kwargs):
    """ Gather outputs of different workers, rebuild `images` and `masks`.

    Parameters
    ----------
    all_outputs : list
        list of outputs. Each item is given by tuple.
    new_batch : bool
        if True, returns new batch with data agregated from all_ouputs.
        if False, changes self.
    **kwargs
        shape : list, tuple or ndarray of int
            (z,y,x)-shape of every image in image component after
            action is performed.
        spacing : tuple, list or ndarray of float
            (z,y,x)-spacing for each image. If supplied, assume that
            unify_spacing is performed.

    Returns
    -------
    batch
    """
    # TODO: process errors
    batch = super()._post_rebuild(all_outputs, new_batch, **kwargs)
    batch.nodules = self.nodules
    batch._rescale_spacing()  # pylint: disable=protected-access
    if self.masks is not None:
        batch.create_mask()
    return batch
@action
def make_xip(self, depth, stride=1, mode='max',
             projection='axial', padding='reflect', **kwargs):
    """ Make intensity projection (maximum, minimum, mean or median).

    Notice that axis is chosen according to projection argument.

    Parameters
    ----------
    depth : int
        number of slices over which xip operation is performed.
    stride : int
        stride-step along projection dimension.
    mode : str
        Possible values are 'max', 'min', 'mean' or 'median'.
    projection : str
        Possible values: 'axial', 'coronal', 'sagital'.
        In case of 'coronal' and 'sagital' projections tensor
        will be transposed from [z,y,x] to [x,z,y] and [y,z,x].
    padding : str
        mode of padding that will be passed in numpy.padding function.

    Raises
    ------
    ValueError
        if `projection` is not one of 'axial', 'coronal', 'sagital'.

    Returns
    -------
    batch
    """
    # Bug fix: an unknown `projection` previously left `_projection`
    # unbound and surfaced later as a confusing NameError; fail fast.
    projection_axes = {'axial': 0, 'coronal': 1, 'sagital': 2}
    if projection not in projection_axes:
        raise ValueError("Argument 'projection' must be one of "
                         "'axial', 'coronal' or 'sagital'")
    _projection = projection_axes[projection]

    batch = super().make_xip(stride=stride, depth=depth, mode=mode,
                             projection=projection, padding=padding, **kwargs)
    if self.nodules is not None:
        projection_spacing = self.nodules.spacing[:, _projection]
        # NOTE(review): `batch.nodules` aliases `self.nodules`, so the
        # in-place size update below also mutates this batch's nodules —
        # confirm that this sharing is intended.
        batch.nodules = self.nodules
        batch.nodules.nodule_size[:, _projection] += (depth * projection_spacing)  # pylint: disable=unsubscriptable-object
    batch._rescale_spacing()  # pylint: disable=protected-access
    if self.masks is not None:
        batch.create_mask()
    return batch
@action
def central_crop(self, crop_size, crop_mask=False, **kwargs):
    """ Make crop of crop_size from center of images.

    Parameters
    ----------
    crop_size : tuple, list or ndarray of int
        (z,y,x)-shape of central crop along three axes (z,y,x order is used).
    crop_mask : bool
        if True, crop the mask in the same way.

    Raises
    ------
    ValueError
        if any image is smaller than `crop_size` along some axis.

    Returns
    -------
    batch
    """
    crop_size = np.asarray(crop_size).reshape(-1)
    halfsize = np.rint(crop_size / 2)

    # every inner 3D image must be at least crop_size along each axis
    item_shapes = [np.asarray(self.get(pos, 'images').shape)
                   for pos in range(len(self))]
    if any(np.any(shape < crop_size) for shape in item_shapes):
        raise ValueError(
            "Crop size must be smaller than size of inner 3D images")

    cropping_masks = crop_mask and self.masks is not None
    image_crops = [make_central_crop(self.get(pos, 'images'), crop_size)
                   for pos in range(len(self))]
    if cropping_masks:
        mask_crops = [make_central_crop(self.get(pos, 'masks'), crop_size)
                      for pos in range(len(self))]

    # rebuild the skyscraper layout with uniform crop-sized items
    self._bounds = np.cumsum([0] + [crop_size[0]] * len(self))
    self.images = np.concatenate(image_crops, axis=0)
    if cropping_masks:
        self.masks = np.concatenate(mask_crops, axis=0)

    # recalculate origin, refresh nodules_info, leave only relevant nodules
    self.origin = self.origin + self.spacing * halfsize
    if self.nodules is not None:
        self._refresh_nodules_info()
        self._filter_nodules_info()
    return self
def flip(self):  # pylint: disable=arguments-differ
    """ Invert the order of slices for each patient.

    Not implemented for masked batches: a no-op that only logs a warning,
    since flipping images without also flipping masks and nodule
    coordinates would desynchronize them.

    Returns
    -------
    batch
        self, unchanged.

    Examples
    --------
    >>> batch = batch.flip()
    """
    # Bug fix: the class name was misspelled ('CTIMagesMaskedBatch');
    # also pass one literal instead of runtime string concatenation.
    logger.warning("There is no implementation of flip method for class "
                   "CTImagesMaskedBatch. Nothing happened")
    return self
@action
def binarize_mask(self, threshold=0.35):
    """ Zero out mask voxels whose value does not exceed the threshold.

    Multiplies `masks` by the 0/1 indicator of `masks > threshold`:
    voxels at or below the threshold become 0, voxels above it keep
    their original values (so probability masks are thresholded rather
    than strictly binarized).

    Parameters
    ----------
    threshold : float
        threshold for masks binarization.

    Returns
    -------
    batch
        self, with `masks` updated in place.
    """
    # fix: np.int was removed in NumPy 1.24; builtin int is the same alias
    self.masks *= np.asarray(self.masks > threshold, dtype=int)
    return self
@action
def predict_on_scan(self, model_name, strides=(16, 32, 32), crop_shape=(32, 64, 64),
                    batch_size=4, targets_mode='segmentation', data_format='channels_last',
                    show_progress=True, model_type='tf'):
    """ Get predictions of the model on data contained in batch.

    Transforms scan data into patches of shape CROP_SHAPE and then feed
    this patches sequentially into model with name specified by argument
    'model_name'; after that loads predicted masks or probabilities into
    'masks' component of the current batch and returns it.

    Parameters
    ----------
    model_name : str
        name of model that will be used for predictions.
    strides : tuple, list or ndarray of int
        (z,y,x)-strides for patching operation.
    crop_shape : tuple, list or ndarray of int
        (z,y,x)-shape of crops.
    batch_size : int
        number of patches to feed in model in one iteration.
    targets_mode : str
        type of targets 'segmentation', 'regression' or 'classification'.
    data_format : str
        format of neural network input data,
        can be 'channels_first' or 'channels_last'.
    show_progress : bool
        if True, wrap the iteration in a tqdm progress bar.
    model_type : str
        represents type of model that will be used for prediction.
        Possible values are 'keras' or 'tf'.

    Returns
    -------
    CTImagesMaskedBatch
        self, with predictions loaded into the `masks` component.
    """
    _model = self.get_model_by_name(model_name)

    # normalize shape arguments to flat int arrays
    crop_shape = np.asarray(crop_shape).reshape(-1)
    strides = np.asarray(strides).reshape(-1)

    patches_arr = self.get_patches(patch_shape=crop_shape, stride=strides, padding='reflect')
    # add channels axis where the model expects it
    if data_format == 'channels_first':
        patches_arr = patches_arr[:, np.newaxis, ...]
    elif data_format == 'channels_last':
        patches_arr = patches_arr[..., np.newaxis]

    predictions = []
    iterations = range(0, patches_arr.shape[0], batch_size)
    if show_progress:
        iterations = tqdm_notebook(iterations)  # pylint: disable=redefined-variable-type
    for i in iterations:
        # tf models take a feed_dict; keras-style models take the array directly
        if model_type == 'tf':
            _prediction = _model.predict(feed_dict={'images': patches_arr[i: i + batch_size, ...]})
        else:
            _prediction = _model.predict(patches_arr[i: i + batch_size, ...])

        current_prediction = np.asarray(_prediction)
        if targets_mode == 'classification':
            # broadcast each per-patch probability to a constant crop-sized volume
            current_prediction = np.stack([np.ones(shape=(crop_shape)) * prob
                                           for prob in current_prediction.ravel()])

        if targets_mode == 'regression':
            # assumes model output columns are [center(3), size(3), label];
            # TODO(review): confirm against the regression model's head
            current_prediction = create_mask_reg(current_prediction[:, :3], current_prediction[:, 3:6],
                                                 current_prediction[:, 6], crop_shape, 0.01)

        predictions.append(current_prediction)

    patches_mask = np.concatenate(predictions, axis=0)
    patches_mask = np.squeeze(patches_mask)
    # stitch per-patch predictions back into a full-scan mask;
    # NOTE(review): scan_shape uses item 0's shape — assumes all items share it
    self.load_from_patches(patches_mask, stride=strides,
                           scan_shape=tuple(self.images_shape[0, :]),
                           data_attr='masks')
    return self
def unpack(self, component='images', **kwargs):
    """ Basic way for unpacking components from batch.

    Parameters
    ----------
    component : str
        component to unpack, can be 'images' or 'masks'; any other name
        is resolved as a batch attribute (called with **kwargs when it
        is a method).
    data_format : str
        can be 'channels_last' or 'channels_first'. Reflects where to put
        channels dimension: right after batch dimension or after all
        spatial axes.
    kwargs : dict
        key-word arguments passed to the callable when `component`
        refers to a method of the batch class.

    Returns
    -------
    ndarray(batch_size, ...) or None
        None when the batch has no such attribute.
    """
    if not hasattr(self, component):
        return None

    # non-skyscraper components: plain attribute or method call
    if component not in ('images', 'masks'):
        attr_value = getattr(self, component)
        return attr_value(**kwargs) if callable(attr_value) else attr_value

    data_format = kwargs.get('data_format', 'channels_last')
    if np.all(self.images_shape == self.images_shape[0, :]):
        # uniform shapes: one cheap reshape of the whole skyscraper
        stacked = self.get(None, component).reshape(-1, *self.images_shape[0, :])
    else:
        stacked = np.stack([self.get(pos, component)
                            for pos in range(len(self))])

    if data_format == 'channels_last':
        stacked = stacked[..., np.newaxis]
    elif data_format == 'channels_first':
        stacked = stacked[:, np.newaxis, ...]
    return stacked
def classification_targets(self, threshold=10, **kwargs):
    """ Unpack data from batch in format suitable for classification task.

    Parameters
    ----------
    threshold : int
        minimum number of '1' pixels in mask to consider it cancerous.

    Returns
    -------
    ndarray(batch_size, 1)
        targets for classification task: labels corresponding to cancerous
        nodules ('1') and non-cancerous nodules ('0').
    """
    # fix: np.int was removed in NumPy 1.24; builtin int is the same alias
    masks_labels = np.asarray([self.get(i, 'masks').sum() > threshold
                               for i in range(len(self))], dtype=int)

    return masks_labels[..., np.newaxis]
def regression_targets(self, threshold=10, **kwargs):
    """ Unpack data from batch in format suitable for regression task.

    Parameters
    ----------
    threshold : int
        minimum number of '1' pixels in mask to consider it cancerous.

    Returns
    -------
    ndarray(batch_size, 7)
        targets for regression task: cancer center, size and label
        (1 for cancerous and 0 for non-cancerous). Note that in case of
        non-cancerous crop first 6 columns of output array will be set
        to zero.
    """
    nodules = self.nodules

    # fix: np.float was removed in NumPy 1.24; builtin float is the same alias
    sizes = np.zeros(shape=(len(self), 3), dtype=float)
    centers = np.zeros(shape=(len(self), 3), dtype=float)

    for item_pos, _ in enumerate(self.indices):
        item_nodules = nodules[nodules.patient_pos == item_pos]

        # items without nodules keep all-zero center/size rows
        if len(item_nodules) == 0:
            continue

        # pick the nodule with the largest extent along any axis
        mask_nod_indices = item_nodules.nodule_size.max(axis=1).argmax()

        # center/size normalized to [0, 1] fractions of the item's image
        nodule_sizes = (item_nodules.nodule_size / self.spacing[item_pos, :]
                        / self.images_shape[item_pos, :])
        nodule_centers = (item_nodules.nodule_center / self.spacing[item_pos, :]
                          / self.images_shape[item_pos, :])

        sizes[item_pos, :] = nodule_sizes[mask_nod_indices, :]
        centers[item_pos, :] = nodule_centers[mask_nod_indices, :]

    labels = self.unpack('classification_targets', threshold=threshold)
    reg_targets = np.concatenate([centers, sizes, labels], axis=1)
    return reg_targets
def segmentation_targets(self, data_format='channels_last', **kwargs):
    """ Unpack data from batch in format suitable for segmentation task.

    Fix: the docstring previously said "regression task" — a copy-paste
    error; this helper returns per-voxel masks for segmentation.

    Parameters
    ----------
    data_format : str
        data_format shows where to put new axis for channels dimension:
        can be 'channels_last' or 'channels_first'.

    Returns
    -------
    ndarray(batch_size, ...)
        batch array with masks.
    """
    return self.unpack('masks', data_format=data_format)
[docs] @staticmethod def make_data_tf(batch, model=None, mode='segmentation', is_training=True, **kwargs): """ Prepare data in batch for training neural network implemented in tensorflow. Parameters ---------- mode : str mode can be one of following 'classification', 'regression' or 'segmentation'. Default is 'segmentation'. data_format : str data format batch data. Can be 'channels_last' or 'channels_first'. Default is 'channels_last'. is_training : bool whether model is in training or prediction mode. Default is True. threshold : int threshold value of '1' pixels in masks to consider it cancerous. Default is 10. Returns ------- dict or None feed dict and fetches for training neural network. """ inputs = batch.unpack('images', **kwargs) if mode in ['segmentation', 'classification', 'regression']: labels = batch.unpack(mode + '_targets', **kwargs) else: raise ValueError("Argument 'mode' must have one of values: " + "'segmentation', 'classification' or 'regression'") feed_dict = dict(images=inputs, labels=labels) if is_training else dict(images=inputs) return dict(feed_dict=feed_dict, fetches=None)
[docs] @staticmethod def make_data_keras(batch, model=None, mode='segmentation', is_training=True, **kwargs): """ Prepare data in batch for training neural network implemented in keras. Parameters ---------- mode : str mode can be one of following 'classification', 'regression' or 'segmentation'. Default is 'segmentation'. data_format : str data format batch data. Can be 'channels_last' or 'channels_first'. Default is 'channels_last'. is_training : bool whether model is in training or prediction mode. Default is True. threshold : int threshold value of '1' pixels in masks to consider it cancerous. Default is 10. Returns ------- dict or None kwargs for keras model train method: {'x': ndarray(...), 'y': ndarrray(...)} for training neural network. """ inputs = batch.unpack('images', **kwargs) if mode in ['segmentation', 'classification', 'regression']: labels = batch.unpack(mode + '_targets', **kwargs) else: raise ValueError("Argument 'mode' must have one of values: " + "'segmentation', 'classification' or 'regression'") return dict(x=inputs, y=labels) if is_training else dict(x=inputs)