Refactor flashloader #329 (Merged)
Commits (80)
58dbcd7 (zain-sohail) major refactor to flash code
f376387 (zain-sohail) update dataframe class to be able to use index and dataset keys
08e8d9f (zain-sohail) minor changes introduced
5c9a04c (zain-sohail) change majorly the class with a new initialize method. now save parqu…
ff5dd07 (zain-sohail) now uses a simpler notation and save_parquet method after loading dat…
7852aaf (zain-sohail) methods made more consistent and fixing the get_index_dataset_key
ac9abea (zain-sohail) include Steinn's proposed solution to pulse_id channel being empty
41fd70d (zain-sohail) include unit tests and fixtures. still many to be done. needs to move…
da00635 (zain-sohail) add more tests, simplify logic on dataframe class
8b39bdb (zain-sohail) remove the gmdTunnel channel because the datafile is not correct. Rep…
e1b9a9f (zain-sohail) major structure changes
cd85dfd (zain-sohail) docstrings etc.
f6ca14e (zain-sohail) updated buffer creation etc. tests won't work currently
c9f1fcc (zain-sohail) fix linting errors and comment out tests for now
1398bf2 (zain-sohail) fix the error of getting wrong attribute in loader, and fix parquet l…
eb72230 (zain-sohail) fix lint error
4d950db (zain-sohail) cleaning up the classes
b8bfdf0 (zain-sohail) add back easy access apis
1f95408 (zain-sohail) small fix
8f551d0 (zain-sohail) small fix
c85fdec (zain-sohail) small fix
0a7e836 (zain-sohail) fix error with pickling
4a787eb (zain-sohail) use old cfg
084f407 (zain-sohail) docstring fixes
73802fa (zain-sohail) fix tests
70a3c5b (zain-sohail) fix certain problems with df_electron and add comprehensive tests fo…
77bf46b (zain-sohail) add tests
d8cc6f6 (zain-sohail) buffer handler tests
09cffec (zain-sohail) ruff formatted
0f23ddb (zain-sohail) add parquethandler tests
ac4f8cd (zain-sohail) further tests
1519752 (zain-sohail) fixes
d31e6b1 (zain-sohail) fix the lint error
ed18a5c (zain-sohail) fix parse_metadata
ce8134f (zain-sohail) put everything in one file
08a2adc (zain-sohail) reorder
74b41dc (zain-sohail) update interface from suggestions
b937db8 (zain-sohail) limit the cores used
9dc69aa (zain-sohail) change interface of parquethandler to suggested
09a93d3 (zain-sohail) fix bug for df indexing
55cfa0c (zain-sohail) merge main branch
4b3e6f7 (zain-sohail) Merge branch 'main' into refactor-flashloader
d316137 (zain-sohail) lint fix
c00207d (zain-sohail) update dataframe saving and loading from parquet behavior
89130b0 (zain-sohail) remove saving/loading of parquets
dbef804 (zain-sohail) add instrument option
afd9772 (zain-sohail) fix tests
b9fce76 (zain-sohail) fix tests
6400878 (zain-sohail) fix tests
7129f57 (zain-sohail) fix tests
bc53214 (zain-sohail) Merge branch 'main' into refactor-flashloader
02aee6e (zain-sohail) added retrocompatibility for older buffer files that have sectorID…
2142c11 (zain-sohail) fix ruff settings
79922ef (zain-sohail) update tests
02ae74e (zain-sohail) make small change to check actions status
f520310 (zain-sohail) bring back types
4c7d069 (zain-sohail) fix small error
9f6a31b (zain-sohail) move utility func test to utility tests
f4a30e0 (zain-sohail) separate into different modules
08f8f13 (zain-sohail) add time_elapsed method
fa68746 (zain-sohail) fix test issues
cb884dd (zain-sohail) add tests for elapsed time
ae26555 (zain-sohail) fix main loader tests
04714bc (zain-sohail) fix sxp loader tests
6589595 (zain-sohail) fix tests
1b73b76 (zain-sohail) fix minor issue with repr html
852a867 (zain-sohail) add available runs property
f010a2e (zain-sohail) Merge branch 'main' into refactor-flashloader
8dd5e6a (zain-sohail) Merge branch 'main' into refactor-flashloader
cd6fbf0 (zain-sohail) Merge branch 'v1_feature_branch' into refactor-flashloader
147e913 (zain-sohail) add back annotations
f2a26b9 (zain-sohail) use index and dataset keys
ebd2b32 (rettigl) Merge remote-tracking branch 'origin/v1_feature_branch' into refactor…
d131fe4 (zain-sohail) remove nans from all electron channels
194c874 (zain-sohail) use pd import, load h5 file inside df creator
af33740 (zain-sohail) update comments to explain the code
50f7ee1 (zain-sohail) make review changes
65d909d (zain-sohail) fix tests with review comments
b7537a8 (zain-sohail) fix dropna
b0b090d (zain-sohail) fix minor stuff and add test to see if exception handling works in pa…
New file (+238 lines):
from __future__ import annotations

import os
from itertools import compress
from pathlib import Path

import dask.dataframe as dd
import pyarrow.parquet as pq
from joblib import delayed
from joblib import Parallel

from sed.core.dfops import forward_fill_lazy
from sed.loader.flash.dataframe import DataFrameCreator
from sed.loader.flash.utils import get_channels
from sed.loader.flash.utils import initialize_paths
from sed.loader.utils import get_parquet_metadata
from sed.loader.utils import split_dld_time_from_sector_id


class BufferHandler:
    """
    A class for handling the creation and manipulation of buffer files using DataFrameCreator.
    """

    def __init__(
        self,
        config: dict,
    ) -> None:
        """
        Initializes the BufferHandler.

        Args:
            config (dict): The configuration dictionary.
        """
        self._config = config["dataframe"]
        self.n_cores = config["core"].get("num_cores", os.cpu_count() - 1)

        self.buffer_paths: list[Path] = []
        self.missing_h5_files: list[Path] = []
        self.save_paths: list[Path] = []

        self.df_electron: dd.DataFrame = None
        self.df_pulse: dd.DataFrame = None
        self.metadata: dict = {}

    def _schema_check(self) -> None:
        """
        Checks the schema of the Parquet files.

        Raises:
            ValueError: If the schema of the Parquet files does not match the configuration.
        """
        existing_parquet_filenames = [file for file in self.buffer_paths if file.exists()]
        parquet_schemas = [pq.read_schema(file) for file in existing_parquet_filenames]
        config_schema_set = set(
            get_channels(self._config["channels"], formats="all", index=True, extend_aux=True),
        )

        for filename, schema in zip(existing_parquet_filenames, parquet_schemas):
            # for retro compatibility when sectorID was also saved in buffer
            if self._config["sector_id_column"] in schema.names:
                config_schema_set.add(
                    self._config["sector_id_column"],
                )
            schema_set = set(schema.names)
            if schema_set != config_schema_set:
                missing_in_parquet = config_schema_set - schema_set
                missing_in_config = schema_set - config_schema_set

                errors = []
                if missing_in_parquet:
                    errors.append(f"Missing in parquet: {missing_in_parquet}")
                if missing_in_config:
                    errors.append(f"Missing in config: {missing_in_config}")

                raise ValueError(
                    f"The available channels do not match the schema of file {filename}. "
                    f"{' '.join(errors)}. "
                    "Please check the configuration file or set force_recreate to True.",
                )
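        # Illustration (hypothetical scenario): if the config defines a channel
        # "gmdTunnel" but an existing buffer file was written before it was
        # added, the ValueError above reports "Missing in parquet: {'gmdTunnel'}".
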
    def _get_files_to_read(
        self,
        h5_paths: list[Path],
        folder: Path,
        prefix: str,
        suffix: str,
        force_recreate: bool,
    ) -> None:
        """
        Determines the list of files to read and the corresponding buffer files to create.

        Args:
            h5_paths (list[Path]): List of paths to H5 files.
            folder (Path): Path to the folder for buffer files.
            prefix (str): Prefix for buffer file names.
            suffix (str): Suffix for buffer file names.
            force_recreate (bool): Flag to force recreation of buffer files.
        """
        # Getting the paths of the buffer files, with subfolder as buffer and no extension
        self.buffer_paths = initialize_paths(
            filenames=[h5_path.stem for h5_path in h5_paths],
            folder=folder,
            subfolder="buffer",
            prefix=prefix,
            suffix=suffix,
            extension="",
        )
        # read only the files that do not exist or if force_recreate is True
        files_to_read = [
            force_recreate or not parquet_path.exists() for parquet_path in self.buffer_paths
        ]

        # Get the list of H5 files to read and the corresponding buffer files to create
        self.missing_h5_files = list(compress(h5_paths, files_to_read))
        self.save_paths = list(compress(self.buffer_paths, files_to_read))
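        # Example: for three buffer paths of which only the second exists (and
        # force_recreate=False), files_to_read is [True, False, True], so
        # compress() keeps only the first and third H5/buffer path pairs.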
        print(f"Reading files: {len(self.missing_h5_files)} new files of {len(h5_paths)} total.")

    def _save_buffer_file(self, h5_path: Path, parquet_path: Path) -> None:
        """
        Creates a single buffer file.

        Args:
            h5_path (Path): Path to the H5 file.
            parquet_path (Path): Path to the buffer file.
        """
        # Create a DataFrameCreator instance and get the dataframe for the h5 file
        df = DataFrameCreator(config_dataframe=self._config, h5_path=h5_path).df

        # Reset the index of the DataFrame and save it as a parquet file
        df.reset_index().to_parquet(parquet_path)

    def _save_buffer_files(self, debug: bool) -> None:
        """
        Creates the buffer files.

        Args:
            debug (bool): Flag to enable debug mode, which serializes the creation.
        """
        n_cores = min(len(self.missing_h5_files), self.n_cores)
        paths = zip(self.missing_h5_files, self.save_paths)
        if n_cores > 0:
            if debug:
                for h5_path, parquet_path in paths:
                    self._save_buffer_file(h5_path, parquet_path)
            else:
                Parallel(n_jobs=n_cores, verbose=10)(
                    delayed(self._save_buffer_file)(h5_path, parquet_path)
                    for h5_path, parquet_path in paths
                )
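    # Note: joblib's Parallel uses process-based workers by default (the loky
    # backend), so each _save_buffer_file call opens its own HDF5 file in its
    # own process; debug=True instead runs the loop serially in-process, which
    # keeps tracebacks and print output readable.
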
    def _fill_dataframes(self):
        """
        Reads all parquet files into one dataframe using dask and fills NaN values.
        """
        dataframe = dd.read_parquet(self.buffer_paths, calculate_divisions=True)
        file_metadata = get_parquet_metadata(
            self.buffer_paths,
            time_stamp_col=self._config.get("time_stamp_alias", "timeStamp"),
        )
        self.metadata["file_statistics"] = file_metadata

        fill_channels: list[str] = get_channels(
            self._config["channels"],
            ["per_pulse", "per_train"],
            extend_aux=True,
        )
        index: list[str] = get_channels(index=True)
        overlap = min(file["num_rows"] for file in file_metadata.values())

        dataframe = forward_fill_lazy(
            df=dataframe,
            columns=fill_channels,
            before=overlap,
            iterations=self._config.get("forward_fill_iterations", 2),
        )
        self.metadata["forward_fill"] = {
            "columns": fill_channels,
            "overlap": overlap,
            "iterations": self._config.get("forward_fill_iterations", 2),
        }
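        # Forward filling propagates the last valid per-pulse/per-train value
        # down to the following rows; before=overlap carries that many rows over
        # from the preceding partition so fills also cross partition boundaries,
        # which is why overlap is capped at the smallest file's row count.
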
        # Drop rows with nan values in electron channels
        df_electron = dataframe.dropna(
            subset=get_channels(self._config["channels"], ["per_electron"]),
        )

        # Set the dtypes of the channels here as there should be no null values
        channel_dtypes = get_channels(self._config["channels"], "all")
        config_channels = self._config["channels"]
        dtypes = {
            channel: config_channels[channel].get("dtype")
            for channel in channel_dtypes
            if config_channels[channel].get("dtype") is not None
        }

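        # Here the "3-bit shift" means the lowest 3 bits of the dld time channel
        # hold the detector sector ID (8 sectors), i.e. sectorID = value & 0b111,
        # with the remaining bits carrying the actual time steps.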
        # Correct the 3-bit shift which encodes the detector ID in the 8s time
        if self._config.get("split_sector_id_from_dld_time", False):
            df_electron, meta = split_dld_time_from_sector_id(
                df_electron,
                config=self._config,
            )
            self.metadata.update(meta)

        self.df_electron = df_electron.astype(dtypes)
        self.df_pulse = dataframe[index + fill_channels]

    def run(
        self,
        h5_paths: list[Path],
        folder: Path,
        force_recreate: bool = False,
        prefix: str = "",
        suffix: str = "",
        debug: bool = False,
    ) -> None:
        """
        Runs the buffer file creation process.

        Args:
            h5_paths (list[Path]): List of paths to H5 files.
            folder (Path): Path to the folder for buffer files.
            force_recreate (bool): Flag to force recreation of buffer files.
            prefix (str): Prefix for buffer file names.
            suffix (str): Suffix for buffer file names.
            debug (bool): Flag to enable debug mode.
        """
        self._get_files_to_read(h5_paths, folder, prefix, suffix, force_recreate)

        if not force_recreate:
            self._schema_check()

        self._save_buffer_files(debug)

        self._fill_dataframes()
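
For orientation, a minimal usage sketch of the class above. The config dict (only its "core" and "dataframe" sections are read), the import path, and all file locations are placeholder assumptions, and the channel definitions are elided, so this is schematic rather than a copy-paste recipe:

from pathlib import Path

from sed.loader.flash.buffer import BufferHandler  # import path assumed

# Placeholder config: "dataframe" must carry the channel definitions and
# aliases that DataFrameCreator and get_channels expect.
config = {
    "core": {"num_cores": 4},
    "dataframe": {"channels": {}},  # channel definitions elided
}

h5_paths = sorted(Path("/path/to/raw").glob("*.h5"))  # placeholder location

bh = BufferHandler(config=config)
bh.run(h5_paths=h5_paths, folder=Path("/path/to/processed"))

df_electron = bh.df_electron  # per-electron dask dataframe (NaN rows dropped)
df_pulse = bh.df_pulse        # per-pulse/per-train dask dataframe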