synapse_net.file_utils

import os
from typing import Dict, List, Optional, Tuple, Union

import mrcfile
import numpy as np
import pooch

try:
    import cryoet_data_portal as cdp
except ImportError:
    cdp = None

try:
    import zarr
except ImportError:
    zarr = None

try:
    import s3fs
except ImportError:
    s3fs = None


def get_cache_dir() -> str:
    """Get the cache directory of synapse-net.

    Returns:
        The cache directory.
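
    Example:
        A minimal sketch (the exact location is platform dependent):

            cache_dir = get_cache_dir()  # e.g. ~/.cache/synapse-net on Linux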
    """
    cache_dir = os.path.expanduser(pooch.os_cache("synapse-net"))
    return cache_dir


def get_data_path(folder: str, n_tomograms: Optional[int] = 1) -> Union[str, List[str]]:
    """Get the paths to all tomograms stored as .rec or .mrc files in a folder.

    Args:
        folder: The folder with tomograms.
        n_tomograms: The expected number of tomograms.

    Returns:
        The filepath or list of filepaths of the tomograms in the folder.
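
    Example:
        Illustrative usage with a hypothetical folder:

            path = get_data_path("/data/tomograms")  # expects exactly one tomogram
            paths = get_data_path("/data/tomograms", n_tomograms=None)  # all tomograms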
    """
    file_names = os.listdir(folder)
    tomograms = []
    for fname in file_names:
        ext = os.path.splitext(fname)[1]
        if ext in (".rec", ".mrc"):
            tomograms.append(os.path.join(folder, fname))

    if n_tomograms is None:
        return tomograms
    assert len(tomograms) == n_tomograms, f"{folder}: {len(tomograms)}, {n_tomograms}"
    return tomograms[0] if n_tomograms == 1 else tomograms


def _parse_voxel_size(voxel_size):
    parsed_voxel_size = None
    try:
        # The voxel sizes are stored in Angstrom in the MRC header, but we want them
        # in nanometer. Hence we divide by a factor of 10 here.
        parsed_voxel_size = {
            "x": voxel_size.x / 10,
            "y": voxel_size.y / 10,
            "z": voxel_size.z / 10,
        }
    except Exception as e:
        print(f"Failed to read voxel size: {e}")
    return parsed_voxel_size


def read_voxel_size(path: str) -> Optional[Dict[str, float]]:
    """Read the voxel size from an mrc/rec file.

    The original unit of the voxel size is Angstrom and we convert it to nanometer by dividing by ten.

    Args:
        path: Path to the mrc/rec file.

    Returns:
        Mapping from the axis name to the voxel size. None if the voxel size could not be read.
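
    Example:
        A sketch with a hypothetical file path:

            voxel_size = read_voxel_size("/data/tomogram.mrc")
            if voxel_size is not None:
                print(voxel_size["x"], voxel_size["y"], voxel_size["z"])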
    """
    with mrcfile.open(path, permissive=True) as mrc:
        voxel_size = _parse_voxel_size(mrc.voxel_size)
    return voxel_size


def read_mrc(path: str) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read the data and voxel size from an mrc/rec file.

    Args:
        path: Path to the mrc/rec file.

    Returns:
        The data read from the file.
        The voxel size read from the file.
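
    Example:
        A sketch with a hypothetical file path:

            data, voxel_size = read_mrc("/data/tomogram.rec")
            print(data.shape, voxel_size)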
    """
    with mrcfile.open(path, permissive=True) as mrc:
        voxel_size = _parse_voxel_size(mrc.voxel_size)
        data = np.asarray(mrc.data[:])
    assert data.ndim in (2, 3)

    # Flip the data along the y-axis to match the python axis order.
    data = np.flip(data, axis=1) if data.ndim == 3 else np.flip(data, axis=0)
    return data, voxel_size


def read_ome_zarr(uri: str, scale_level: int = 0, fs=None) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read the data and voxel size from an ome.zarr file.

    Args:
        uri: Path or URL to the ome.zarr file.
        scale_level: The level of the multi-scale image pyramid to load.
        fs: S3 filesystem to use for initializing the store.

    Returns:
        The data read from the file.
        The voxel size read from the file.
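
    Example:
        Illustrative calls for a local file and for an s3 address (both paths are hypothetical):

            data, voxel_size = read_ome_zarr("/data/tomogram.ome.zarr")
            data, voxel_size = read_ome_zarr("s3://bucket/tomogram.ome.zarr", scale_level=1)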
    """
    if zarr is None:
        raise RuntimeError("The zarr library is required to read ome.zarr files.")

    def parse_s3_uri(uri):
        # Strip the "s3://" prefix to get the root path for the store.
        return uri[len("s3://"):] if uri.startswith("s3://") else uri

    if uri.startswith("s3://"):
        if s3fs is None:
            raise RuntimeError("The s3fs library is required to read ome.zarr files from s3.")
        if fs is None:
            fs = s3fs.S3FileSystem(anon=True)
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    elif fs is not None:
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    else:
        if not os.path.exists(uri):
            raise ValueError(f"Cannot find the filepath at {uri}.")
        store = uri

    with zarr.open(store, "r") as f:
        multiscales = f.attrs["multiscales"][0]

        # Read the axis and transformation metadata for this dataset to determine the voxel size.
        axes = [axis["name"] for axis in multiscales["axes"]]
        assert set(axes) == set("xyz")
        units = [axis.get("unit", "angstrom") for axis in multiscales["axes"]]
        assert all(unit in ("angstrom", "nanometer") for unit in units)

        transformations = multiscales["datasets"][scale_level]["coordinateTransformations"]
        scale_transformation = [trafo["scale"] for trafo in transformations if trafo["type"] == "scale"][0]

        # Convert the given unit size to nanometer.
        # (It is typically given in angstrom, so we divide by a factor of 10.)
        unit_factor = [10.0 if unit == "angstrom" else 1.0 for unit in units]
        voxel_size = {axis: scale / factor for axis, scale, factor in zip(axes, scale_transformation, unit_factor)}

        # Get the internal path for the given scale and load the data.
        internal_path = multiscales["datasets"][scale_level]["path"]
        data = f[internal_path][:]

    return data, voxel_size


def read_data_from_cryo_et_portal_run(
    run_id: int,
    output_path: Optional[str] = None,
    use_zarr_format: bool = True,
    processing_type: str = "denoised",
    id_field: str = "run_id",
    scale_level: Optional[int] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read the data and voxel size from a CryoET Data Portal run.

    Args:
        run_id: The ID of the experiment run.
        output_path: The path for saving the data. The data will be streamed if the path is not given.
        use_zarr_format: Whether to use the data in zarr format instead of mrc.
        processing_type: The processing type of the tomogram to download.
        id_field: The name of the id field. One of 'id' or 'run_id'.
            The 'id' references specific tomograms, whereas 'run_id' references a collection of experimental data.
        scale_level: The scale level to read from the data. Only valid for zarr data.

    Returns:
        The data read from the run.
        The voxel size read from the run.
        Both are None if no matching tomogram was found.
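
    Example:
        A sketch for streaming a denoised tomogram (the run id is hypothetical):

            data, voxel_size = read_data_from_cryo_et_portal_run(1234)
            # Or download it as an mrc file instead of streaming the zarr data:
            data, voxel_size = read_data_from_cryo_et_portal_run(
                1234, output_path="/data/tomogram.mrc", use_zarr_format=False
            )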
    """
    assert id_field in ("id", "run_id")
    if output_path is not None and os.path.exists(output_path):
        return read_ome_zarr(output_path) if use_zarr_format else read_mrc(output_path)

    if cdp is None:
        raise RuntimeError("The CryoET data portal library is required to download data from the portal.")
    if s3fs is None:
        raise RuntimeError("The CryoET data portal download requires the s3fs library.")

    client = cdp.Client()

    fs = s3fs.S3FileSystem(anon=True)
    tomograms = cdp.Tomogram.find(
        client, [getattr(cdp.Tomogram, id_field) == run_id, cdp.Tomogram.processing == processing_type]
    )
    if len(tomograms) == 0:
        return None, None
    if len(tomograms) > 1:
        raise NotImplementedError(f"Expected a single tomogram for {id_field}={run_id}, got {len(tomograms)}.")
    tomo = tomograms[0]

    if use_zarr_format:
        if output_path is None:
            scale_level = 0 if scale_level is None else scale_level
            data, voxel_size = read_ome_zarr(tomo.s3_omezarr_dir, scale_level=scale_level, fs=fs)
        else:
            # TODO: write the output to ome zarr, for all scale levels.
            raise NotImplementedError("Downloading the zarr data to an output_path is not yet supported.")
    else:
        if scale_level is not None:
            raise ValueError("The scale_level argument is only supported for zarr data.")
        if output_path is None:
            raise RuntimeError("You have to pass an output_path to download the data as an mrc file.")
        fs.get(tomo.s3_mrc_file, output_path)
        data, voxel_size = read_mrc(output_path)

    return data, voxel_size