# pylint: disable=no-member
# pylint: disable=too-many-public-methods
# pylint: disable=too-many-locals
# pylint: disable=too-many-arguments
""" Batch class CTImagesMaskedBatch for storing CT-scans with masks. """
import logging
import numpy as np
import pandas as pd
from numba import njit
from skimage import measure
try:
from tqdm import tqdm_notebook
except ImportError:
tqdm_notebook = lambda x: x
from .ct_batch import CTImagesBatch
from .mask import make_mask_numba, create_mask_reg
from .histo import sample_histo3d
from .crop import make_central_crop
from ..dataset import action, DatasetIndex, SkipBatchException # pylint: disable=no-name-in-module
# logger initialization
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@njit(nogil=True)
def get_nodules_numba(data, positions, size):
    """ Fetch nodules from array by starting positions.

    Cuts `p` boxes of identical shape `size` out of `data` and stacks them
    along the first axis, so that the result has CTImagesBatch-compatible
    `skyscraper` structure.

    Parameters
    ----------
    data : ndarray
        CTImagesBatch `skyscraper` represented by 3D ndarray.
    positions : ndarray(p, 3) of int
        Contains nodules' starting indices along [zyx]-axis accordingly in `data`.
    size : ndarray(3,) of int
        Contains nodules' sizes along each axis (z,y,x).

    Returns
    -------
    ndarray
        3d ndarray of shape (p * size[0], size[1], size[2]) with nodules,
        allocated with the default dtype of `np.zeros`.

    Notes
    -----
    Dtypes of positions and size arrays must be the same.
    No bounds adjustment is performed here: starting positions are used as is.
    """
    # shape[0] is already an integer, so no cast is needed; the former
    # np.int(...) wrapper relied on an alias that NumPy has removed.
    n_positions = positions.shape[0]
    out_arr = np.zeros((n_positions, size[0], size[1], size[2]))
    for i in range(n_positions):
        out_arr[i, :, :, :] = data[positions[i, 0]: positions[i, 0] + size[0],
                                   positions[i, 1]: positions[i, 1] + size[1],
                                   positions[i, 2]: positions[i, 2] + size[2]]
    # collapse (nodule, z) into a single axis to get the skyscraper layout
    return out_arr.reshape(n_positions * size[0], size[1], size[2])
[docs]class CTImagesMaskedBatch(CTImagesBatch):
""" Batch class for storing batch of ct-scans with masks for nodules.
Allows to load info about cancer nodules, then create cancer-masks
for each patient. Created masks are stored in self.masks
Parameters
----------
index : dataset.index
ids of scans to be put in a batch
Attributes
----------
components : tuple of strings.
List names of data components of a batch, which are `images`,
`masks`, `origin` and `spacing`.
NOTE: Implementation of this attribute is required by Base class.
num_nodules : int
number of nodules in batch
images : ndarray
contains ct-scans for all patients in batch.
masks : ndarray
contains masks for all patients in batch.
nodules : np.recarray
contains info on cancer nodules location.
record array contains the following information about nodules:
- self.nodules.nodule_center -- ndarray(num_nodules, 3) centers of
nodules in world coords;
- self.nodules.nodule_size -- ndarray(num_nodules, 3) sizes of
nodules along z, y, x in world coord;
- self.nodules.img_size -- ndarray(num_nodules, 3) sizes of images of
patient data corresponding to nodules;
- self.nodules.offset -- ndarray(num_nodules, 3) position of individual
patient scan inside batch;
- self.nodules.spacing -- ndarray(num_nodules, 3) of spacing attribute
of patients which correspond to nodules;
- self.nodules.origin -- ndarray(num_nodules, 3) of origin attribute
of patients which correspond to nodules.
"""
# Layout of a single record of the `nodules` record array.
# Builtin `int`/`float` are used because the `np.int`/`np.float` aliases were
# removed from NumPy. `patient_pos` is declared as a plain scalar field: the
# `(type, 1)` spelling is deprecated and is slated to mean shape `(1,)`.
nodules_dtype = np.dtype([('patient_pos', int),
                          ('offset', int, (3,)),
                          ('img_size', int, (3,)),
                          ('nodule_center', float, (3,)),
                          ('nodule_size', float, (3,)),
                          ('spacing', float, (3,)),
                          ('origin', float, (3,))])

# names of data components of a batch
components = "images", "masks", "spacing", "origin"
@staticmethod
def make_indices(size):
    """ Generate array of batch indices of given `size`.

    Every index is produced by a separate call to
    `CTImagesMaskedBatch.make_filename`.

    Parameters
    ----------
    size : int
        number of indices to generate.

    Returns
    -------
    ndarray
        array with `size` generated indices.

    Examples
    --------
    >>> indices = CTImagesMaskedBatch.make_indices(20)
    >>> indices
    array(['3c3eb09b', '5b192d1f', 'f28ddbb0', '14460196', '31a92510',
           '3f324e44', '066ccf28', '5570938d', '5d1fb8f6', '539ea09c',
           '68f9f235', '8f7b0c49', 'c7903591', 'dc8e9504', '54e9eebc',
           '778abd5a', '99691fc6', '7da49e85', '0f343345', '876fb9e6'], dtype='<U8')
    """
    generated = [CTImagesMaskedBatch.make_filename() for _ in range(size)]
    return np.array(generated)
def __init__(self, index, *args, **kwargs):
    """ Build the batch and reset mask-related attributes.

    Parameters
    ----------
    index : Dataset.Index class.
        Required indexing of objects (files).
    *args, **kwargs
        forwarded unchanged to the parent constructor.
    """
    super().__init__(index, *args, **kwargs)
    # neither masks nor nodules info exist until explicitly created/fetched
    self.masks = self.nodules = None
def nodules_to_df(self, nodules):
    """ Convert nodules_info ndarray into pandas dataframe.

    Resulting DataFrame has the following columns (in this order):
    'nodule_id' - freshly generated id for each nodule;
    'source_id' - id of source element of batch;
    'locZ', 'locY', 'locX' - coordinates of nodules' centers;
    'diamZ', 'diamY', 'diamX' - sizes of nodules along zyx axes.

    Parameters
    ----------
    nodules : ndarray of type nodules_info
        nodules_info type is defined inside of CTImagesMaskedBatch class.

    Returns
    -------
    pd.DataFrame
        centers, ids and sizes of nodules.
    """
    generated_ids = self.make_indices(nodules.shape[0])
    centers = nodules.nodule_center
    sizes = nodules.nodule_size
    table = {'source_id': self.indices[nodules.patient_pos],
             'nodule_id': generated_ids,
             'locZ': centers[:, 0],
             'locY': centers[:, 1],
             'locX': centers[:, 2],
             'diamZ': sizes[:, 0],
             'diamY': sizes[:, 1],
             'diamX': sizes[:, 2]}
    # explicit column list fixes the column order of the frame
    column_order = ['nodule_id', 'source_id', 'locZ', 'locY',
                    'locX', 'diamZ', 'diamY', 'diamX']
    return pd.DataFrame(table, columns=column_order)
def get_pos(self, data, component, index):
    """ Return a position of an item for a given index in data
    or in self.`component`.

    When `data` is supplied, `index` is returned untouched. Otherwise the
    position is resolved inside the batch: for 'images' and 'masks' it is
    a slice between lower and upper bounds of the item, for any other
    component it is a slice of length one.

    Parameters
    ----------
    data : None or ndarray
        data from which subsetting is done.
        If None, retrieve position from `component` of batch,
        otherwise `index` is returned as is.
    component : str
        name of a component, f.ex. 'images'.
        if component provided, data should be None.
    index : str or int
        index of an item to be looked for.
        may be key from dataset (str)
        or index inside batch (int).

    Returns
    -------
    slice or type of `index`
        Position of item.

    Notes
    -----
    This is an overload of get_pos from base Batch-class,
    see corresponding docstring for detailed explanation.
    """
    if data is not None:
        return index

    item_pos = self._get_verified_pos(index)
    if component not in ('images', 'masks'):
        return slice(item_pos, item_pos + 1)
    return slice(self.lower_bounds[item_pos], self.upper_bounds[item_pos])
@property
def num_nodules(self):
    """ Get number of nodules in CTImagesMaskedBatch.

    Returns
    -------
    int
        length of `self.nodules.patient_pos`, or 0 when `self.nodules`
        is None (nodules info has not been fetched yet).
    """
    return 0 if self.nodules is None else self.nodules.patient_pos.shape[0]
@action
def fetch_nodules_info(self, nodules=None, nodules_records=None, update=False, images_loaded=True):
    """Extract nodules' info from nodules into attribute self.nodules.

    Parameters
    ----------
    nodules : pd.DataFrame
        contains:
         - 'seriesuid': index of patient or series.
         - 'coordZ','coordY','coordX': coordinates of nodules center.
         - 'diameter_mm': diameter, in mm.
        Used only when `nodules_records` is None.
    nodules_records : np.recarray
        if not None, is stored in self.nodules as is and should
        contain the same fields as described in Notes.
    update : bool
        if False and nodules info is already present, a warning is
        logged and the batch is returned unchanged.
        if True, nodules info is overwritten.
    images_loaded : bool
        if True, i.e. `images` component is loaded,
        and image_size is used to compute
        correct nodules location inside `skyscraper`.
        If False, it doesn't update info of location
        inside `skyscraper`.

    Returns
    -------
    batch

    Raises
    ------
    ValueError
        if `nodules_records` is None and `nodules` is not a DataFrame
        containing all required columns.

    Notes
    -----
    Run this action only after  :func:`~radio.CTImagesBatch.load`.
    The method fills in record array self.nodules that contains the following information about nodules:
     - self.nodules.nodule_center -- ndarray(num_nodules, 3) centers of
       nodules in world coords;
     - self.nodules.nodule_size -- ndarray(num_nodules, 3) sizes of
       nodules along z, y, x in world coord;
     - self.nodules.img_size -- ndarray(num_nodules, 3) sizes of images of
       patient data corresponding to nodules;
     - self.nodules.offset -- ndarray(num_nodules, 3) of biases of
       patients which correspond to nodules;
     - self.nodules.spacing -- ndarray(num_nodules, 3) of spacing attribute
       of patients which correspond to nodules;
     - self.nodules.origin -- ndarray(num_nodules, 3) of origin attribute
       of patients which correspond to nodules.
     - self.nodules.patient_pos -- ndarray(num_nodules,) refers to
       positions of patients which correspond to stored nodules.
    """
    if self.nodules is not None and not update:
        logger.warning("Nodules have already been extracted. " +
                       "Put update argument as True for refreshing")
        return self

    if nodules_records is not None:
        # load from record-array
        self.nodules = nodules_records
    else:
        # assume that nodules is supplied and load from it
        required_columns = ['seriesuid', 'diameter_mm',
                            'coordZ', 'coordY', 'coordX']
        # plain set logic instead of the deprecated np.in1d
        if not (isinstance(nodules, pd.DataFrame)
                and set(required_columns).issubset(nodules.columns)):
            raise ValueError(("Argument 'nodules' must be pandas DataFrame"
                              + " with {} columns. Make sure that data provided"
                              + " in correct format.").format(required_columns))

        # keep only nodules that belong to items of this batch
        nodules_df = nodules.set_index('seriesuid')
        unique_indices = nodules_df.index.unique()
        inter_index = np.intersect1d(unique_indices, self.indices)
        nodules_df = nodules_df.loc[inter_index,
                                    ["coordZ", "coordY",
                                     "coordX", "diameter_mm"]]

        num_nodules = nodules_df.shape[0]
        self.nodules = np.rec.array(np.zeros(num_nodules,
                                             dtype=self.nodules_dtype))
        # itertuples yields (index, coordZ, coordY, coordX, diameter_mm)
        for counter, (pat_id, coordz, coordy, coordx, diam) in enumerate(nodules_df.itertuples()):
            self.nodules.patient_pos[counter] = self.index.get_pos(pat_id)
            self.nodules.nodule_center[counter, :] = np.array([coordz,
                                                               coordy,
                                                               coordx])
            # the same diameter is used along all three axes
            self.nodules.nodule_size[counter, :] = np.array([diam, diam, diam])

    self._refresh_nodules_info(images_loaded)
    return self
@action
def fetch_nodules_from_mask(self, images_loaded=True):
    """ Fetch nodules info (centers and sizes) from masks.

    Runs skimage.measure.label for fetching nodules regions
    from masks. Extracts nodules info from segmented regions
    and put this information in self.nodules np.recarray.
    Any nodules info stored before is overwritten.

    Parameters
    ----------
    images_loaded : bool
        if True, i.e. `images` component is loaded,
        and image_size is used to compute
        correct nodules location inside `skyscraper`.
        If False, it doesn't update info of location
        inside `skyscraper`.

    Returns
    -------
    batch

    Notes
    -----
    Every region gets one diameter (`equivalent_diameter` of the region)
    that is replicated for [zyx] and then multiplied by item's spacing.
    """
    found = []
    for pos in range(len(self)):
        mask = self.get(pos, 'masks')
        mask_labels = measure.label(mask, background=0)
        for props in measure.regionprops(np.int16(mask_labels)):
            # voxel centroid -> world coordinates
            center = np.asarray(props.centroid[:3], dtype=float)
            center = center * self.spacing[pos] + self.origin[pos]

            # voxel diameter -> world size along each axis
            diameter = np.asarray([props.equivalent_diameter] * 3, dtype=float)
            diameter = diameter * self.spacing[pos]

            found.append((pos, center, diameter))

    self.nodules = np.rec.array(np.zeros(len(found), dtype=self.nodules_dtype))
    for i, (pos, center, diameter) in enumerate(found):
        self.nodules.patient_pos[i] = pos
        self.nodules.nodule_center[i, :] = center
        self.nodules.nodule_size[i, :] = diameter
    self._refresh_nodules_info(images_loaded)
    return self
# TODO: another name of method
def _fit_into_bounds(self, size, variance=None):
    """ Fetch start voxel coordinates of all nodules.

    Get start voxel coordinates of all nodules in batch.
    Note that all nodules are considered to have
    fixed same size defined by argument size: if a box is out of
    patient's 3d image bounds then it is shifted back inside.

    Parameters
    ----------
    size : list, tuple or ndarray of int
        (z,y,x)-shape of the box placed around each nodule.
    variance : ndarray(3, )
        diagonal elements of multivariate normal distribution,
        for sampling random shifts along (z,y,x) correspondingly.
        If None, no random shift is applied.

    Returns
    -------
    ndarray
        integer start coordinates (z,y,x) of all nodules inside
        batch `skyscraper`, shape (num_nodules, 3).
    """
    size = np.array(size, dtype=int)

    # nodule centers in voxel coordinates of corresponding scans
    center_pix = np.abs(self.nodules.nodule_center -
                        self.nodules.origin) / self.nodules.spacing
    start_pix = (np.rint(center_pix) - np.rint(size / 2))
    if variance is not None:
        start_pix += np.random.multivariate_normal(np.zeros(3),
                                                   np.diag(variance),
                                                   self.nodules.patient_pos.shape[0])
    end_pix = start_pix + size

    # shift boxes that stick out over the upper border of a scan
    bias_upper = np.maximum(end_pix - self.nodules.img_size, 0)
    start_pix -= bias_upper
    end_pix -= bias_upper

    # shift boxes that start before the lower border of a scan
    bias_lower = np.maximum(-start_pix, 0)
    start_pix += bias_lower
    end_pix += bias_lower

    # move from scan coordinates to skyscraper coordinates
    return (start_pix + self.nodules.offset).astype(int)
@action
def create_mask(self):
    """ Create `masks` component from `nodules` component.

    Allocates zero-filled `masks` of the same shape as `images` and lets
    `make_mask_numba` fill in the nodules' boxes.

    Returns
    -------
    batch

    Notes
    -----
    `nodules` must be not None before calling this method.
    see :func:`~radio.preprocessing.ct_masked_batch.CTImagesMaskedBatch.fetch_nodules_info`
    for more details. If `nodules` is None, a warning is logged and the
    batch is returned untouched.
    """
    if self.nodules is None:
        logger.warning("Info about nodules location must " +
                       "be loaded before calling this method. " +
                       "Nothing happened.")
        # nothing to build the mask from: bail out instead of failing on None
        return self

    self.masks = np.zeros_like(self.images)

    # nodule centers and start corners in voxel coordinates
    center_pix = np.abs(self.nodules.nodule_center -
                        self.nodules.origin) / self.nodules.spacing
    start_pix = (center_pix - np.rint(self.nodules.nodule_size /
                                      self.nodules.spacing / 2))
    start_pix = np.rint(start_pix).astype(int)
    make_mask_numba(self.masks, self.nodules.offset,
                    self.nodules.img_size + self.nodules.offset, start_pix,
                    np.rint(self.nodules.nodule_size / self.nodules.spacing))
    return self
def fetch_mask(self, shape):
    """ Create `masks` component of different size than `images`,
    using `nodules` component.

    Parameters
    ----------
    shape : tuple, list or ndarray of int.
        (z_dim,y_dim,x_dim), shape of mask to be created for each item.

    Returns
    -------
    ndarray
        3d array with masks in form of `skyscraper`, of shape
        (len(batch) * z_dim, y_dim, x_dim). If `nodules` is None,
        a warning is logged and an all-zero mask is returned.

    Notes
    -----
    Scale factor is inferred from the shape of the first item of batch,
    i.e. all items are assumed to be already resized to equal shapes.

    # TODO: one part of code from here repeats create_mask function
        better to unify these two func
    """
    mask = np.zeros(shape=(len(self) * shape[0], *shape[1:]))

    if self.nodules is None:
        logger.warning("Info about nodules location must " +
                       "be loaded before calling this method. " +
                       "Nothing happened.")
        # no nodules to draw: return the empty mask instead of failing on None
        return mask

    # infer scale factor; assume patients are already resized to equal
    # shapes
    scale_factor = np.asarray(shape) / self.images_shape[0, :]

    # get rescaled nodule-centers, nodule-sizes, offsets, locs of nod
    # starts
    center_scaled = (np.abs(self.nodules.nodule_center - self.nodules.origin) /
                     self.nodules.spacing * scale_factor)
    start_scaled = (center_scaled - scale_factor * self.nodules.nodule_size /
                    self.nodules.spacing / 2)
    start_scaled = np.rint(start_scaled).astype(int)
    offset_scaled = np.rint(self.nodules.offset *
                            scale_factor).astype(int)
    img_size_scaled = np.rint(
        self.nodules.img_size * scale_factor).astype(int)
    nod_size_scaled = (np.rint(scale_factor * self.nodules.nodule_size /
                               self.nodules.spacing)).astype(int)

    # put nodules into mask
    make_mask_numba(mask, offset_scaled, img_size_scaled + offset_scaled,
                    start_scaled, nod_size_scaled)
    return mask
# TODO rename function to sample_random_nodules_positions
def sample_random_nodules(self, num_nodules, nodule_size, histo=None):
    """ Sample random nodules positions in CTImagesBatchMasked.

    Picks `num_nodules` batch items at random (with replacement) and, for
    each of them, a random start position of a crop of shape `nodule_size`.

    Parameters
    ----------
    num_nodules : int
        number of nodules to sample from dataset.
    nodule_size : ndarray(3, )
        crop shape along (z,y,x).
    histo : tuple
        np.histogram()'s output.
        3d-histogram, represented by tuple (bins, edges). If None,
        start positions are sampled uniformly.

    Returns
    -------
    tuple(ndarray, ndarray)
        first element is ndarray(num_nodules, 3) of int with start
        positions of crops in batch `skyscraper`; second element is
        ndarray(num_nodules,) with positions of batch items the crops
        were sampled from.
    """
    sampled_indices = np.random.choice(np.arange(len(self)), num_nodules, replace=True)

    # z-offset of each chosen item inside the skyscraper
    offset = np.zeros((num_nodules, 3))
    offset[:, 0] = self.lower_bounds[sampled_indices]

    data_shape = self.images_shape[sampled_indices, :]
    free_room = data_shape - nodule_size

    # if supplied, use histogram as the sampler
    if histo is None:
        samples = np.random.rand(num_nodules, 3) * free_room
    else:
        samples = sample_histo3d(histo, num_nodules) * free_room
        samples = samples / data_shape

    return np.asarray(samples + offset, dtype=int), sampled_indices
@action
def sample_nodules(self, batch_size, nodule_size=(32, 64, 64), share=0.8, variance=None,  # pylint: disable=too-many-locals, too-many-statements
                   mask_shape=None, histo=None):
    """ Sample random crops of `images` and `masks` from batch.

    Create random crops, both with and without nodules in it, from input batch.

    Parameters
    ----------
    batch_size : int
        number of nodules in the output batch. Required,
        if share=0.0. If None, resulting batch will include all
        cancerous nodules.
    nodule_size : tuple, list or ndarray of int
        crop shape along (z,y,x).
    share : float
        share of cancer crops in the batch.
        if input CTImagesBatch contains less cancer
        nodules than needed random nodules will be taken.
    variance : tuple, list or ndarray of float
        variances of normally distributed random shifts of
        nodules' start positions. Must contain exactly 3 values,
        otherwise a warning is logged and no shift is applied.
    mask_shape : tuple, list or ndarray of int
        size of `masks` crop in (z,y,x)-order. If not None,
        crops with masks would be of mask_shape.
        If None, mask crop shape would be equal to crop_size.
    histo : tuple
        np.histogram()'s output.
        Used for sampling non-cancerous crops.

    Returns
    -------
    Batch
        batch with cancerous and non-cancerous crops in a proportion defined by
        `share` with total `batch_size` nodules. If `share` == 1.0, `batch_size`
        is None, resulting batch consists of all cancerous crops stored in batch.

    Raises
    ------
    AttributeError
        if nodules info has not been fetched.
    ValueError
        if `share` is 0.0 and `batch_size` is None.
    SkipBatchException
        if resulting batch size is zero.
    """
    # make sure that nodules' info is fetched and args are OK
    if self.nodules is None:
        raise AttributeError("Info about nodules location must " +
                             "be loaded before calling this method")
    if variance is not None:
        # variances are floats; an integer cast would silently truncate them
        variance = np.asarray(variance, dtype=float).flatten()
        if len(variance) != 3:
            logger.warning('Argument variance be np.array-like' +
                           'and has shape (3,). ' +
                           'Would be used no-scale-shift.')
            variance = None

    if share == 0.0 and batch_size is None:
        raise ValueError('Either supply batch_size or set share to positive number')

    # pos of batch-items that correspond to crops
    crops_indices = np.zeros(0, dtype=np.int16)

    # infer the number of cancerous nodules and the size of batch
    batch_size = batch_size if batch_size is not None else 1.0 / share * self.num_nodules
    cancer_n = int(share * batch_size)
    batch_size = int(batch_size)
    cancer_n = min(cancer_n, self.num_nodules)

    if batch_size == 0:
        raise SkipBatchException('Batch of zero size cannot be passed further through the workflow')

    # choose cancerous nodules' starting positions
    nodule_size = np.asarray(nodule_size, dtype=int)
    if self.num_nodules == 0:
        cancer_nodules = np.zeros((0, 3))
    else:
        # adjust cancer nodules' starting positions s.t. nodules fit into
        # scan-boxes
        cancer_nodules = self._fit_into_bounds(nodule_size, variance=variance)

        # randomly select needed number of cancer nodules (their starting
        # positions)
        sample_indices = np.random.choice(np.arange(self.num_nodules),
                                          size=cancer_n, replace=False)
        cancer_nodules = cancer_nodules[sample_indices, :]

        # store scans-indices for chosen crops
        cancerous_indices = self.nodules.patient_pos[sample_indices].reshape(-1)
        crops_indices = np.concatenate([crops_indices, cancerous_indices])

    nodules_st_pos = cancer_nodules

    # if non-cancerous nodules are needed, add random starting pos
    if batch_size - cancer_n > 0:
        # sample starting positions for (most-likely) non-cancerous crops
        random_nodules, random_indices = self.sample_random_nodules(batch_size - cancer_n,
                                                                    nodule_size, histo=histo)

        # concat non-cancerous and cancerous crops' starting positions
        nodules_st_pos = np.vstack([nodules_st_pos, random_nodules]).astype(int)

        # store scan-indices for randomly chose crops
        crops_indices = np.concatenate([crops_indices, random_indices])

    # obtain nodules' scans by cropping from self.images
    images = get_nodules_numba(self.images, nodules_st_pos, nodule_size)

    # if mask_shape not None, compute scaled mask for the whole batch
    # scale also nodules' starting positions and nodules' shapes.
    # Scaled positions live in a separate variable: image-space positions
    # are still needed below to compute origins of the crops.
    if mask_shape is not None:
        mask_shape = np.asarray(mask_shape, dtype=int)
        scale_factor = mask_shape / nodule_size
        batch_mask_shape = np.rint(scale_factor * self.images_shape[0, :]).astype(int)
        batch_mask = self.fetch_mask(batch_mask_shape)
        masks_st_pos = np.rint(scale_factor * nodules_st_pos).astype(int)
    else:
        batch_mask = self.masks
        mask_shape = nodule_size
        masks_st_pos = nodules_st_pos

    # crop nodules' masks
    masks = get_nodules_numba(batch_mask, masks_st_pos, mask_shape)

    # build nodules' batch
    bounds = np.arange(batch_size + 1) * nodule_size[0]
    crops_spacing = self.spacing[crops_indices]
    offset = np.zeros((batch_size, 3))
    offset[:, 0] = self.lower_bounds[crops_indices]
    # world origin of a crop = origin of its scan + spacing * start voxel inside the scan
    crops_origin = self.origin[crops_indices] + crops_spacing * (nodules_st_pos - offset)
    names_gen = zip(self.indices[crops_indices], self.make_indices(batch_size))
    ix_batch = ['_'.join([prefix, random_str]) for prefix, random_str in names_gen]
    nodules_batch = type(self)(DatasetIndex(ix_batch))
    nodules_batch._init_data(images=images, bounds=bounds, spacing=crops_spacing, origin=crops_origin, masks=masks)  # pylint: disable=protected-access

    # set nodules info in nodules' batch: every crop inherits all nodules
    # of the scan it was cut from, re-pointed to the crop's position
    nodules_records = [self.nodules[self.nodules.patient_pos == crop_pos] for crop_pos in crops_indices]
    new_patient_pos = np.repeat(np.arange(len(nodules_records)),
                                [len(records) for records in nodules_records])
    nodules_records = np.concatenate(nodules_records).view(np.recarray)
    nodules_records.patient_pos = new_patient_pos
    nodules_batch.fetch_nodules_info(nodules_records=nodules_records)

    # leave out nodules with zero-intersection with crops' boxes
    nodules_batch._filter_nodules_info()  # pylint: disable=protected-access

    return nodules_batch
@action
def sample_dump(self, dst, n_iters, nodule_size=(32, 64, 64), batch_size=20, share=0.8, **kwargs):
    """ Perform sample_nodules and dump on the same batch n_iters times.

    Can be used for fast creation of large datasets of cancerous/non-cancerous crops.

    Parameters
    ----------
    dst : str
        folder to dump nodules in.
    n_iters : int
        number of iterations to be performed.
    nodule_size : tuple, list or ndarray of int
        (z,y,x)-shape of sampled nodules.
    batch_size : int or None
        size of generated batches.
    share : float
        share of cancer nodules. See docstring of sample_nodules for more info
        about possible combinations of parameters share and batch_size.
    **kwargs : dict
        additional arguments supplied into sample_nodules. See docstring
        of sample_nodules for more info.

    Returns
    -------
    batch
        the source batch itself; sampled batches are only dumped.
    """
    iteration = 0
    while iteration < n_iters:
        crops_batch = self.sample_nodules(batch_size=batch_size, nodule_size=nodule_size,
                                          share=share, **kwargs)
        crops_batch.dump(dst=dst)
        iteration += 1
    return self
@action
def update_nodules_histo(self, histo):
    """ Update histogram of nodules' locations using nodules locations from batch.

    Parameters
    ----------
    histo : list or tuple
        (counts, edges) as returned by np.histogramdd(), used for sampling
        cancerous locations. `counts` array is updated in place.

    Returns
    -------
    batch

    Notes
    -----
    Execute action only after .fetch_nodules_info().
    """
    # infer bins' bounds from histo
    bins = histo[1]

    # get cancer_nodules' centers in voxel coords
    center_pix = np.abs(self.nodules.nodule_center -
                        self.nodules.origin) / self.nodules.spacing

    # update bins of histo. The counts array is modified through a local
    # name: `histo[0] += ...` would also try to re-assign the item and
    # therefore fail when histo is a tuple.
    histo_delta = np.histogramdd(center_pix, bins=bins)
    counts = histo[0]
    counts += histo_delta[0]

    return self
def get_axial_slice(self, patient_pos, height):
    """ Get tuple of `images` slice and `masks` slice by patient and slice position.

    Parameters
    ----------
    patient_pos : int
        patient position in the batch
    height : float
        number of slice (z-axis), scaled to [0:1]
        used to get slice with position:
        int(height * number_of slices_for_patient) from
        patient's scan and mask.

    Returns
    -------
    tuple
        (images_slice, masks_slice) by patient_pos and number of slice;
        masks_slice is None when batch has no masks.
    """
    scan = self.get(patient_pos, 'images')
    slice_no = int(height * scan.shape[0])
    image_slice = scan[slice_no, :, :]
    if self.masks is None:
        return (image_slice, None)
    mask_slice = self.get(patient_pos, 'masks')[slice_no, :, :]
    return (image_slice, mask_slice)
def _refresh_nodules_info(self, images_loaded=True):
    """ Refresh self.nodules attributes [spacing, origin, img_size, bias].

    Copies per-item attributes of the batch into per-nodule records of
    self.nodules. Called whenever batch's inner data has changed,
    e.g. after resize.

    Parameters
    ----------
    images_loaded : bool
        if True, assumes that `_bounds` attribute is computed,
        i.e. either `masks` and/or `images` are loaded, and updates
        `offset` and `img_size` as well. `spacing` and `origin` are
        refreshed in any case.
    """
    records = self.nodules
    if images_loaded:
        # only z-component of the offset depends on position in skyscraper
        records.offset[:, 0] = self.lower_bounds[records.patient_pos]
        records.img_size = self.images_shape[records.patient_pos, :]

    records.spacing = self.spacing[records.patient_pos, :]
    records.origin = self.origin[records.patient_pos, :]
def _filter_nodules_info(self):
    """ Filter record-array self.nodules s.t. only records about cancerous nodules
    that have non-zero intersection with scan-boxes be present.

    A nodule is dropped if, along at least one axis, its box starts at or
    after the end of the scan, or ends at or before voxel 0.

    Notes
    -----
    can be called only after execution of fetch_nodules_info and _refresh_nodules_info
    """
    # nodules start and trailing pixel-coords
    center_pix = (self.nodules.nodule_center - self.nodules.origin) / self.nodules.spacing
    start_pix = center_pix - np.rint(self.nodules.nodule_size / self.nodules.spacing / 2)
    start_pix = np.rint(start_pix).astype(int)
    end_pix = start_pix + np.rint(self.nodules.nodule_size / self.nodules.spacing)

    # find nodules with no intersection with scan-boxes
    nods_images_shape = self.images_shape[self.nodules.patient_pos]
    start_mask = np.any(start_pix >= nods_images_shape, axis=1)
    end_mask = np.any(end_pix <= 0, axis=1)
    zero_mask = start_mask | end_mask

    # filter out such nodules
    self.nodules = self.nodules[~zero_mask]
def _rescale_spacing(self):
    """ Refresh nodules info, if there is any, and return the batch.

    Method is called after any operation that changes shape of inner data.
    Does nothing when nodules info has not been fetched.
    """
    if self.nodules is None:
        return self
    self._refresh_nodules_info()
    return self
def _post_mask(self, list_of_arrs, **kwargs):
    """ Concatenate outputs of different workers and put the result in `masks`

    Exceptions returned by workers are re-raised before concatenation.

    Parameters
    ----------
    list_of_arrs : list
        list of ndarrays of patients' masks.
    **kwargs
        accepted and ignored.

    Returns
    -------
    batch
    """
    self._reraise_worker_exceptions(list_of_arrs)
    # items are stacked along z-axis, i.e. in skyscraper layout
    self.masks = np.concatenate(list_of_arrs, axis=0)
    return self
def _init_load_blosc(self, **kwargs):
    """ Init-func for load from blosc.

    Fills images/masks-components with zeroes if the components are to be updated.

    Parameters
    ----------
    **kwargs
            components : str, list or tuple
                name or iterable of names of components that need to be loaded

    Returns
    -------
    list
        list of ids of batch-items, i.e. series ids or patient ids.
    """
    components = kwargs['components']
    # a bare string must be treated as one name, not as an iterable of chars
    if isinstance(components, str):
        components = (components,)

    # fill 'images', 'masks'-comps with zeroes if needed
    skysc_components = {'images', 'masks'} & set(components)
    self._prealloc_skyscraper_components(skysc_components)

    return self.indices
def _post_rebuild(self, all_outputs, new_batch=False, **kwargs):
    """ Gather outputs of different workers, rebuild `images` and `masks`.

    Delegates rebuilding of the batch to the parent class, then hands
    nodules info over to the resulting batch, refreshes it and, if the
    source batch had masks, re-creates masks from nodules.

    Parameters
    ----------
    all_outputs : list
        list of outputs. Each item is given by tuple.
    new_batch : bool
        if True, returns new batch with data aggregated
        from all_outputs. if False, changes self.
    **kwargs
            shape : list, tuple or ndarray of int
                (z,y,x)-shape of every image in image component after action is performed.
            spacing : tuple, list or ndarray of float
                (z,y,x)-spacing for each image. If supplied, assume that
                unify_spacing is performed.

    Returns
    -------
    batch
    """
    # TODO: process errors
    rebuilt = super()._post_rebuild(all_outputs, new_batch, **kwargs)
    rebuilt.nodules = self.nodules
    rebuilt._rescale_spacing()  # pylint: disable=protected-access
    masks_present = self.masks is not None
    if masks_present:
        rebuilt.create_mask()
    return rebuilt
@action
def make_xip(self, depth, stride=1, mode='max', projection='axial', padding='reflect', **kwargs):
    """ Make intensity projection (maximum, minimum, mean or median).

    Notice that axis is chosen according to projection argument.
    The projection itself is computed by the parent class; this method
    additionally enlarges nodules along the projection axis by
    `depth * spacing` and rebuilds masks if they were present.

    Parameters
    ----------
    depth : int
        number of slices over which xip operation is performed.
    stride : int
        stride-step along projection dimension.
    mode : str
        Possible values are 'max', 'min', 'mean' or 'median'.
    projection : str
        Possible values: 'axial', 'coronal', 'sagital'.
        In case of 'coronal' and 'sagital' projections tensor
        will be transposed from [z,y,x] to [x,z,y] and [y,z,x].
    padding : str
        mode of padding that will be passed in numpy.padding function.

    Returns
    -------
    batch

    Raises
    ------
    ValueError
        if nodules info is present and `projection` is not one of
        the supported values.
    """
    batch = super().make_xip(stride=stride, depth=depth, mode=mode,
                             projection=projection, padding=padding, **kwargs)

    if self.nodules is not None:
        axis = {'axial': 0, 'coronal': 1, 'sagital': 2}.get(projection)
        if axis is None:
            raise ValueError("Argument 'projection' must be one of 'axial', "
                             "'coronal' or 'sagital', got {!r}".format(projection))
        projection_spacing = self.nodules.spacing[:, axis]
        # NOTE(review): record array is shared with self, so nodule sizes of
        # the source batch are enlarged too if `batch` is a new object -- confirm
        batch.nodules = self.nodules
        batch.nodules.nodule_size[:, axis] += (depth * projection_spacing)  # pylint: disable=unsubscriptable-object
        batch._rescale_spacing()  # pylint: disable=protected-access

    if self.masks is not None:
        batch.create_mask()
    return batch
@action
def central_crop(self, crop_size, crop_mask=False, **kwargs):
    """ Make crop of crop_size from center of images.

    Parameters
    ----------
    crop_size : tuple, list or ndarray of int
        (z,y,x)-shape of central crop along three axes(z,y,x order is used).
    crop_mask : bool
        if True, crop the mask in the same way. Masks are left untouched
        if False or if batch has no masks.
    **kwargs
        accepted and ignored.

    Returns
    -------
    batch

    Raises
    ------
    ValueError
        if any image is smaller than `crop_size` along some axis.
    """
    crop_size = np.asarray(crop_size).reshape(-1)
    crop_halfsize = np.rint(crop_size / 2)
    n_items = len(self)

    # every image has to be at least as large as the crop
    for i in range(n_items):
        item_shape = np.asarray(self.get(i, 'images').shape)
        if np.any(item_shape < crop_size):
            raise ValueError(
                "Crop size must be smaller than size of inner 3D images")

    with_masks = crop_mask and self.masks is not None
    cropped_images, cropped_masks = [], []
    for i in range(n_items):
        cropped_images.append(make_central_crop(self.get(i, 'images'), crop_size))
        if with_masks:
            cropped_masks.append(make_central_crop(self.get(i, 'masks'), crop_size))

    self._bounds = np.cumsum([0] + [crop_size[0]] * len(self))
    self.images = np.concatenate(cropped_images, axis=0)
    if with_masks:
        self.masks = np.concatenate(cropped_masks, axis=0)

    # recalculate origin, refresh nodules_info, leave only relevant nodules
    # NOTE(review): origin is shifted by half of the crop size, not by the
    # position where the crop starts -- confirm against make_central_crop
    self.origin = self.origin + self.spacing * crop_halfsize
    if self.nodules is not None:
        self._refresh_nodules_info()
        self._filter_nodules_info()
    return self
def flip(self):    # pylint: disable=arguments-differ
    """ Placeholder for inverting the order of slices for each patient.

    Flip is not implemented for masked batches: the method only logs
    a warning and leaves the batch as it is.

    Returns
    -------
    batch

    Examples
    --------
    >>> batch = batch.flip()
    """
    message = ("There is no implementation of flip method for class "
               "CTIMagesMaskedBatch. Nothing happened")
    logger.warning(message)
    return self
@action
def binarize_mask(self, threshold=0.35):
    """ Binarize masks by threshold.

    Mask values that do not exceed `threshold` are set to zero in place;
    values above it are multiplied by one, i.e. kept as they are.

    Parameters
    ----------
    threshold : float
        threshold for masks binarization.

    Returns
    -------
    batch
    """
    self.masks *= np.asarray(self.masks > threshold, dtype=int)
    return self
@action
def predict_on_scan(self, model_name, strides=(16, 32, 32), crop_shape=(32, 64, 64),
                    batch_size=4, targets_mode='segmentation', data_format='channels_last',
                    show_progress=True, model_type='tf'):
    """ Get predictions of the model on data contained in batch.

    Splits scans into patches of shape `crop_shape`, feeds the patches
    chunk by chunk into the model named `model_name`, converts model
    output into per-patch masks and assembles them into `masks` component
    of the current batch.

    Parameters
    ----------
    model_name : str
        name of model that will be used for predictions.
    strides : tuple, list or ndarray of int
        (z,y,x)-strides for patching operation.
    crop_shape : tuple, list or ndarray of int
        (z,y,x)-shape of crops.
    batch_size : int
        number of patches to feed in model in one iteration.
    targets_mode: str
        type of targets 'segmentation', 'regression' or 'classification'.
    data_format: str
        format of neural network input data,
        can be 'channels_first' or 'channels_last'.
    show_progress : bool
        if True, iterations are wrapped into tqdm_notebook.
    model_type : str
        represents type of model that will be used for prediction.
        Possible values are 'keras' or 'tf'.

    Returns
    -------
    CTImagesMaskedBatch.
    """
    model = self.get_model_by_name(model_name)
    crop_shape = np.asarray(crop_shape).reshape(-1)
    strides = np.asarray(strides).reshape(-1)

    patches = self.get_patches(patch_shape=crop_shape,
                               stride=strides,
                               padding='reflect')
    # add channels axis where the network expects it
    if data_format == 'channels_first':
        patches = patches[:, np.newaxis, ...]
    elif data_format == 'channels_last':
        patches = patches[..., np.newaxis]

    collected = []
    chunk_starts = range(0, patches.shape[0], batch_size)
    if show_progress:
        chunk_starts = tqdm_notebook(chunk_starts)  # pylint: disable=redefined-variable-type

    for start in chunk_starts:
        chunk = patches[start: start + batch_size, ...]
        if model_type == 'tf':
            raw_output = model.predict(feed_dict={'images': chunk})
        else:
            raw_output = model.predict(chunk)

        chunk_masks = np.asarray(raw_output)
        if targets_mode == 'classification':
            # one constant-valued patch per predicted probability
            chunk_masks = np.stack([np.ones(shape=(crop_shape)) * prob
                                    for prob in chunk_masks.ravel()])
        elif targets_mode == 'regression':
            chunk_masks = create_mask_reg(chunk_masks[:, :3],
                                          chunk_masks[:, 3:6],
                                          chunk_masks[:, 6],
                                          crop_shape, 0.01)
        collected.append(chunk_masks)

    patches_mask = np.squeeze(np.concatenate(collected, axis=0))
    self.load_from_patches(patches_mask, stride=strides,
                           scan_shape=tuple(self.images_shape[0, :]),
                           data_attr='masks')
    return self
def unpack(self, component='images', **kwargs):
    """ Basic way for unpacking components from batch.

    Parameters
    ----------
    component : str
        name of batch attribute to unpack. 'images' and 'masks' are returned
        as arrays with a leading item axis; any other attribute is returned
        as it is, or called with `kwargs` if it is callable.
    data_format : str
        can be 'channels_last' (default) or 'channels_first'. Reflects where
        to put channels dimension: after all spatial axes or right after
        batch dimension. For any other value no channels axis is added.
        Used only for 'images' and 'masks'.
    kwargs : dict
        key-word arguments that will be passed in callable if
        component argument refers to method of batch class.

    Returns
    -------
    ndarray(batch_size, ...) or None
        None is returned when batch has no attribute named `component`.
    """
    if not hasattr(self, component):
        return None

    if component not in ('images', 'masks'):
        attribute = getattr(self, component)
        return attribute(**kwargs) if callable(attribute) else attribute

    first_shape = self.images_shape[0, :]
    if np.all(self.images_shape == first_shape):
        # all items share one shape: split the whole component in one reshape
        unpacked = self.get(None, component).reshape(-1, *first_shape)
    else:
        unpacked = np.stack([self.get(pos, component) for pos in range(len(self))])

    data_format = kwargs.get('data_format', 'channels_last')
    if data_format == 'channels_last':
        return unpacked[..., np.newaxis]
    if data_format == 'channels_first':
        return unpacked[:, np.newaxis, ...]
    return unpacked
def classification_targets(self, threshold=10, **kwargs):
    """ Unpack data from batch in format suitable for classification task.

    An item gets label '1' when the sum of its mask is strictly greater
    than `threshold` and '0' otherwise.

    Parameters
    ----------
    threshold : int
        minimum number of '1' pixels in mask to consider it cancerous.
    kwargs : dict
        ignored; accepted so that extra key-word arguments do not break the call.

    Returns
    -------
    ndarray(batch_size, 1)
        targets for classification task: labels corresponding to cancerous
        nodules ('1') and non-cancerous nodules ('0').
    """
    # builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24
    masks_labels = np.asarray([self.get(i, 'masks').sum() > threshold
                               for i in range(len(self))], dtype=int)
    return masks_labels[..., np.newaxis]
def regression_targets(self, threshold=10, **kwargs):
    """ Unpack data from batch in format suitable for regression task.

    For every item the nodule with the largest extent along any axis is
    selected; its center and size are divided by the item's spacing and
    then by the item's image shape.

    Parameters
    ----------
    threshold : int
        minimum number of '1' pixels in mask to consider it cancerous.
    kwargs : dict
        ignored; accepted so that extra key-word arguments do not break the call.

    Returns
    -------
    ndarray(batch_size, 7)
        targets for regression task: cancer center, size
        and label(1 for cancerous and 0 for non-cancerous). Note that for
        items without nodules (or when batch has no nodules at all) first
        6 columns of output array are set to zero.
    """
    nodules = self.nodules

    # builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24
    sizes = np.zeros(shape=(len(self), 3), dtype=float)
    centers = np.zeros(shape=(len(self), 3), dtype=float)

    # a batch without nodules info keeps zero centers and sizes for all items
    if nodules is not None:
        for item_pos, _ in enumerate(self.indices):
            item_nodules = nodules[nodules.patient_pos == item_pos]
            if len(item_nodules) == 0:
                continue

            # position of the nodule having the largest size along any axis
            largest = item_nodules.nodule_size.max(axis=1).argmax()

            spacing = self.spacing[item_pos, :]
            shape = self.images_shape[item_pos, :]
            sizes[item_pos, :] = item_nodules.nodule_size[largest, :] / spacing / shape
            # NOTE(review): center is not shifted by self.origin before scaling;
            # presumably centers are expected relative to the item origin - confirm.
            centers[item_pos, :] = item_nodules.nodule_center[largest, :] / spacing / shape

    labels = self.unpack('classification_targets', threshold=threshold)
    reg_targets = np.concatenate([centers, sizes, labels], axis=1)
    return reg_targets
def segmentation_targets(self, data_format='channels_last', **kwargs):
    """ Unpack masks from batch in format suitable for segmentation task.

    Parameters
    ----------
    data_format : str
        data_format shows where to put new axis for channels dimension:
        can be 'channels_last' or 'channels_first'.
    kwargs : dict
        ignored; only `data_format` is forwarded to `unpack`.

    Returns
    -------
    ndarray(batch_size, ...)
        batch array with masks.
    """
    masks = self.unpack('masks', data_format=data_format)
    return masks
@staticmethod
def make_data_tf(batch, model=None, mode='segmentation', is_training=True, **kwargs):
    """ Prepare data in batch for training neural network implemented in tensorflow.

    Parameters
    ----------
    batch : CTImagesMaskedBatch
        batch to take images and targets from.
    model
        not used by this method.
    mode : str
        mode can be one of following 'classification', 'regression'
        or 'segmentation'. Default is 'segmentation'.
    is_training : bool
        whether model is in training or prediction mode. Default is True.
        In prediction mode targets are not unpacked at all.
    kwargs : dict
        passed both to unpacking of images and of targets, e.g.
        `data_format` ('channels_last' or 'channels_first') and
        `threshold` (number of '1' pixels in mask to consider it cancerous).

    Returns
    -------
    dict
        dict with items 'feed_dict' and 'fetches' (always None); feed dict
        contains 'images' and, in training mode, 'labels'.

    Raises
    ------
    ValueError
        if `mode` is not one of supported values.
    """
    # validate first, so that an invalid mode does not cost unpacking of images
    if mode not in ('segmentation', 'classification', 'regression'):
        raise ValueError("Argument 'mode' must have one of values: "
                         "'segmentation', 'classification' or 'regression'")

    feed_dict = dict(images=batch.unpack('images', **kwargs))
    # targets are needed for training only; batches used for prediction
    # may have no masks to build them from
    if is_training:
        feed_dict['labels'] = batch.unpack(mode + '_targets', **kwargs)
    return dict(feed_dict=feed_dict, fetches=None)
@staticmethod
def make_data_keras(batch, model=None, mode='segmentation', is_training=True, **kwargs):
    """ Prepare data in batch for training neural network implemented in keras.

    Parameters
    ----------
    batch : CTImagesMaskedBatch
        batch to take images and targets from.
    model
        not used by this method.
    mode : str
        mode can be one of following 'classification', 'regression'
        or 'segmentation'. Default is 'segmentation'.
    is_training : bool
        whether model is in training or prediction mode. Default is True.
        In prediction mode targets are not unpacked at all.
    kwargs : dict
        passed both to unpacking of images and of targets, e.g.
        `data_format` ('channels_last' or 'channels_first') and
        `threshold` (number of '1' pixels in mask to consider it cancerous).

    Returns
    -------
    dict
        kwargs for keras model train method:
        {'x': ndarray(...), 'y': ndarray(...)} in training mode,
        {'x': ndarray(...)} in prediction mode.

    Raises
    ------
    ValueError
        if `mode` is not one of supported values.
    """
    # validate first, so that an invalid mode does not cost unpacking of images
    if mode not in ('segmentation', 'classification', 'regression'):
        raise ValueError("Argument 'mode' must have one of values: "
                         "'segmentation', 'classification' or 'regression'")

    data = dict(x=batch.unpack('images', **kwargs))
    # targets are needed for training only; batches used for prediction
    # may have no masks to build them from
    if is_training:
        data['y'] = batch.unpack(mode + '_targets', **kwargs)
    return data