synapse_net.file_utils
import os
from typing import Dict, List, Optional, Tuple, Union

import mrcfile
import numpy as np
import pooch

try:
    import cryoet_data_portal as cdp
except ImportError:
    cdp = None

try:
    import zarr
except ImportError:
    zarr = None

try:
    import s3fs
except ImportError:
    s3fs = None


def get_cache_dir() -> str:
    """Get the cache directory of synapse net.

    Returns:
        The cache directory.
    """
    cache_dir = os.path.expanduser(pooch.os_cache("synapse-net"))
    return cache_dir


def get_data_path(folder: str, n_tomograms: Optional[int] = 1) -> Union[str, List[str]]:
    """Get the paths to all tomograms stored as .rec or .mrc files in a folder.

    Args:
        folder: The folder with tomograms.
        n_tomograms: The expected number of tomograms.

    Returns:
        The filepath or list of filepaths of the tomograms in the folder.
    """
    file_names = os.listdir(folder)
    tomograms = []
    for fname in file_names:
        ext = os.path.splitext(fname)[1]
        if ext in (".rec", ".mrc"):
            tomograms.append(os.path.join(folder, fname))

    if n_tomograms is None:
        return tomograms
    assert len(tomograms) == n_tomograms, f"{folder}: {len(tomograms)}, {n_tomograms}"
    return tomograms[0] if n_tomograms == 1 else tomograms


def _parse_voxel_size(voxel_size):
    parsed_voxel_size = None
    try:
        # The voxel sizes are stored in Angstrom in the MRC header, but we want them
        # in nanometer. Hence we divide by a factor of 10 here.
        parsed_voxel_size = {
            "x": voxel_size.x / 10,
            "y": voxel_size.y / 10,
            "z": voxel_size.z / 10,
        }
    except Exception as e:
        print(f"Failed to read voxel size: {e}")
    return parsed_voxel_size


def read_voxel_size(path: str) -> Optional[Dict[str, float]]:
    """Read voxel size from mrc/rec file.

    The voxel size is stored in Angstrom in the MRC header; we convert it to nanometer by dividing by ten.

    Args:
        path: Path to mrc/rec file.

    Returns:
        Mapping from the axis name to voxel size. None if the voxel size could not be read.
    """
    with mrcfile.open(path, permissive=True) as mrc:
        voxel_size = _parse_voxel_size(mrc.voxel_size)
    return voxel_size


def read_mrc(path: str) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from mrc/rec file.

    Args:
        path: Path to mrc/rec file.

    Returns:
        The data read from the file.
        The voxel size read from the file.
    """
    with mrcfile.open(path, permissive=True) as mrc:
        voxel_size = _parse_voxel_size(mrc.voxel_size)
        data = np.asarray(mrc.data[:])
    assert data.ndim in (2, 3)

    # Flip the data along the y-axis to match the python axis order.
    data = np.flip(data, axis=1) if data.ndim == 3 else np.flip(data, axis=0)
    return data, voxel_size


def read_ome_zarr(uri: str, scale_level: int = 0, fs=None) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from an ome.zarr file.

    Args:
        uri: Path or URL to the ome.zarr file.
        scale_level: The level of the multi-scale image pyramid to load.
        fs: S3 filesystem to use for initializing the store.

    Returns:
        The data read from the file.
        The voxel size read from the file.
    """
    if zarr is None:
        raise RuntimeError("The zarr library is required to read ome.zarr files.")

    def parse_s3_uri(uri):
        # Remove the 's3://' prefix. (Don't use lstrip here: it strips a set of
        # characters from the start of the string, not a prefix.)
        return uri[len("s3://"):] if uri.startswith("s3://") else uri

    if uri.startswith("s3://"):
        if s3fs is None:
            raise RuntimeError("The s3fs library is required to read ome.zarr files from S3.")
        if fs is None:
            fs = s3fs.S3FileSystem(anon=True)
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    elif fs is not None:
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    else:
        if not os.path.exists(uri):
            raise ValueError(f"Cannot find the filepath at {uri}.")
        store = uri

    with zarr.open(store, "r") as f:
        multiscales = f.attrs["multiscales"][0]

        # Read the axis and transformation metadata for this dataset, to determine the voxel size.
        axes = [axis["name"] for axis in multiscales["axes"]]
        assert set(axes) == set("xyz")
        units = [axis.get("unit", "angstrom") for axis in multiscales["axes"]]
        assert all(unit in ("angstrom", "nanometer") for unit in units)

        transformations = multiscales["datasets"][scale_level]["coordinateTransformations"]
        scale_transformation = [trafo["scale"] for trafo in transformations if trafo["type"] == "scale"][0]

        # Convert the given unit size to nanometer.
        # (It is typically given in angstrom, so we have to divide by a factor of 10.)
        unit_factor = [10.0 if unit == "angstrom" else 1.0 for unit in units]
        voxel_size = {axis: scale / factor for axis, scale, factor in zip(axes, scale_transformation, unit_factor)}

        # Get the internal path for the given scale and load the data.
        internal_path = multiscales["datasets"][scale_level]["path"]
        data = f[internal_path][:]

    return data, voxel_size


def read_data_from_cryo_et_portal_run(
    run_id: int,
    output_path: Optional[str] = None,
    use_zarr_format: bool = True,
    processing_type: str = "denoised",
    id_field: str = "run_id",
    scale_level: Optional[int] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from a CryoET Data Portal run.

    Args:
        run_id: The ID of the experiment run.
        output_path: The path for saving the data. The data will be streamed if the path is not given.
        use_zarr_format: Whether to use the data in zarr format instead of mrc.
        processing_type: The processing type of the tomogram to download.
        id_field: The name of the id field. One of 'id' or 'run_id'.
            The 'id' references specific tomograms, whereas 'run_id' references a collection of experimental data.
        scale_level: The scale level to read from the data. Only valid for zarr data.

    Returns:
        The data read from the run.
        The voxel size read from the run.
    """
    assert id_field in ("id", "run_id")
    if output_path is not None and os.path.exists(output_path):
        return read_ome_zarr(output_path) if use_zarr_format else read_mrc(output_path)

    if cdp is None:
        raise RuntimeError("The cryoet_data_portal library is required to download data from the portal.")
    if s3fs is None:
        raise RuntimeError("The s3fs library is required to download data from the portal.")

    client = cdp.Client()

    fs = s3fs.S3FileSystem(anon=True)
    tomograms = cdp.Tomogram.find(
        client, [getattr(cdp.Tomogram, id_field) == run_id, cdp.Tomogram.processing == processing_type]
    )
    if len(tomograms) == 0:
        return None, None
    if len(tomograms) > 1:
        raise NotImplementedError("Handling multiple tomograms per query is not yet supported.")
    tomo = tomograms[0]

    if use_zarr_format:
        if output_path is None:
            scale_level = 0 if scale_level is None else scale_level
            data, voxel_size = read_ome_zarr(tomo.s3_omezarr_dir, scale_level=scale_level, fs=fs)
        else:
            # TODO: write the output to ome.zarr, for all scale levels.
            raise NotImplementedError
    else:
        if scale_level is not None:
            raise ValueError("The scale_level argument is only supported for zarr data.")
        if output_path is None:
            raise RuntimeError("You have to pass an output_path to download the data as an mrc file.")
        fs.get(tomo.s3_mrc_file, output_path)
        data, voxel_size = read_mrc(output_path)

    return data, voxel_size
def get_cache_dir() -> str:
Get the cache directory of synapse net.
Returns:
The cache directory.
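A minimal usage sketch; the printed location depends on the operating system, since pooch chooses the platform's standard cache folder:

from synapse_net.file_utils import get_cache_dir

cache_dir = get_cache_dir()
print(cache_dir)  # e.g. ~/.cache/synapse-net on Linux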
def get_data_path(folder: str, n_tomograms: Optional[int] = 1) -> Union[str, List[str]]:
Get the paths to all tomograms stored as .rec or .mrc files in a folder.
Arguments:
- folder: The folder with tomograms.
- n_tomograms: The expected number of tomograms.
Returns:
The filepath or list of filepaths of the tomograms in the folder.
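An illustrative sketch; the folder path is hypothetical:

from synapse_net.file_utils import get_data_path

# Expect exactly one tomogram (the default) and get back a single filepath.
tomo_path = get_data_path("/data/tomograms/sample_01")

# Pass n_tomograms=None to skip the count check and get all matches as a list.
tomo_paths = get_data_path("/data/tomograms/sample_01", n_tomograms=None)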
def read_voxel_size(path: str) -> Optional[Dict[str, float]]:
Read voxel size from mrc/rec file.
The voxel size is stored in Angstrom in the MRC header; we convert it to nanometer by dividing by ten.
Arguments:
- path: Path to mrc/rec file.
Returns:
Mapping from the axis name to voxel size. None if the voxel size could not be read.
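A short sketch, assuming a local file tomogram.mrc (a hypothetical name); the returned values are in nanometer:

from synapse_net.file_utils import read_voxel_size

voxel_size = read_voxel_size("tomogram.mrc")
if voxel_size is not None:  # None signals that the header could not be parsed.
    print(voxel_size)  # e.g. {'x': 1.554, 'y': 1.554, 'z': 1.554}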
def read_mrc(path: str) -> Tuple[numpy.ndarray, Dict[str, float]]:
Read data and voxel size from mrc/rec file.
Arguments:
- path: Path to mrc/rec file.
Returns:
The data read from the file. The voxel size read from the file.
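A usage sketch with a hypothetical filename; note that the returned array is flipped along the y-axis relative to the raw MRC data:

from synapse_net.file_utils import read_mrc

data, voxel_size = read_mrc("tomogram.mrc")
print(data.shape)  # typically (z, y, x) for a 3D tomogram
print(voxel_size)  # mapping from axis name to size in nanometer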
def read_ome_zarr(uri: str, scale_level: int = 0, fs=None) -> Tuple[numpy.ndarray, Dict[str, float]]:
Read data and voxel size from an ome.zarr file.
Arguments:
- uri: Path or URL to the ome.zarr file.
- scale_level: The level of the multi-scale image pyramid to load.
- fs: S3 filesystem to use for initializing the store.
Returns:
The data read from the file. The voxel size read from the file.
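Two usage sketches, one local and one remote; the filename and S3 URI below are illustrative, not real dataset paths:

from synapse_net.file_utils import read_ome_zarr

# Read the full-resolution data from a local ome.zarr file.
data, voxel_size = read_ome_zarr("tomogram.ome.zarr")

# Read a downsampled pyramid level from a public bucket. When fs is not
# given, an anonymous s3fs.S3FileSystem is created internally.
data, voxel_size = read_ome_zarr(
    "s3://example-bucket/tomogram.zarr", scale_level=2
)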
def read_data_from_cryo_et_portal_run(run_id: int, output_path: Optional[str] = None, use_zarr_format: bool = True, processing_type: str = 'denoised', id_field: str = 'run_id', scale_level: Optional[int] = None) -> Tuple[numpy.ndarray, Dict[str, float]]:
Read data and voxel size from a CryoET Data Portal run.
Arguments:
- run_id: The ID of the experiment run.
- output_path: The path for saving the data. The data will be streamed if the path is not given.
- use_zarr_format: Whether to use the data in zarr format instead of mrc.
- processing_type: The processing type of the tomogram to download.
- id_field: The name of the id field. One of 'id' or 'run_id'. The 'id' references specific tomograms, whereas 'run_id' references a collection of experimental data.
- scale_level: The scale level to read from the data. Only valid for zarr data.
Returns:
The data read from the run. The voxel size read from the run.
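A usage sketch; the run ID is illustrative, and the calls require the optional cryoet_data_portal and s3fs dependencies:

from synapse_net.file_utils import read_data_from_cryo_et_portal_run

# Stream the denoised tomogram in zarr format without saving it.
data, voxel_size = read_data_from_cryo_et_portal_run(run_id=123)
if data is None:
    print("No tomogram matched the query.")

# Alternatively, download it as an mrc file; output_path is required here.
data, voxel_size = read_data_from_cryo_et_portal_run(
    run_id=123, use_zarr_format=False, output_path="tomogram.mrc"
)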