6 - Morphological features extraction

Description: This notebook contains an example on how to read SRX localization data (in csv format) and extract morphological features to describe the shape of the trace.

These features together with descriptive statistics (number of localizations, mean position, mean precision, etc.) can be used to categorize individual segments.

The following index has links to the different sections of the notebook. Some sections may have additional indexes to access the subparts.
If you are using VS Code to visualize the notebook, the links will not work. This is due to how VC Code render the notebook. To navigate the notebook use the outline panel. To know more about it, check this link.

Content:

Library imports and functions

Back to the general index

import re
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import colormaps
import seaborn as sns
import pandas as pd
from scipy.stats import mannwhitneyu, median_abs_deviation

from cima.parsers.parser_csv import CSVParser
from cima.utils.walk_features import getSegmentsFeatures
from cima.utils.stats_function import statConvert

# Set the maximum number of rows to display in the output
pd.options.display.max_rows = 999

# Set the options for saving the plots
plot_opts = {"dpi": 300, "bbox_inches": 'tight', "transparent": True}

# Labels for the plots of the features, so they look nice
plot_labels={
    "volume":"Volume ($nm^3$)",
    "area":"Area ($nm^2$)",
    "sphericity":"Sphericity",
    "sphericity_bribiesca":"Sphericity Bribiesca",
    "density_kb":"Density (blink/bp)",
    "density":"Density (blink/$nm^3$)",
    "compaction":"Genomic compactness (bp/$nm^3$)",
    "solidity":"Solidity",
    "radius_of_gyration":"Radius of gyration",
    "point_cloud_ellipticity":"Ellipticity",
    "numerosity":"Numerosity"
    }

# When showing dataframes we want that this are the first columns, so the user can manually search for them (not recomended)
first_cols = ["experimentID", "nucleusID", "chromosome", "homolog", "timepoint", "name", "flag"]

## Regular expressions to get the metadata from the filename
regex_patterns = {
    "nucleusID": re.compile(r"(?i)nuc(\d+)"), # This searches for "Nuc" or "nuc" followed by digits
    "cellID": re.compile(r"(?i)cell(\d+)"), # This searches for "Cell" or "cell" followed by digits
    "locationID" : re.compile(r"(?i)loc-?(\d+)"), # This searches for "Loc" or "loc" followed by optional hyphen and digits
    "date" : re.compile(r"(\d{4}[-\.]\d{2}[-\.]\d{2})"), # This searches for dates in the format YYYY-MM-DD or YYYY.MM.DD
    "homolog" : re.compile(r"_([ABPM01pm])"), ## Added just in case the homolog is in the filename.
    "chromosome" : re.compile(r"(?i)(chr[\d+MXY])") ## Added just in case the chromosome is in the filename.
    }

# Dictionary to change all the data types of the columns of the different dataframes, so the notebook is mor memory
# efficient.
dtype_dict = {
    "experimentID" : "string", "nucleusID" : "string", "chromosome" : "string",
    "homolog" : "string", "timepoint" : np.int16, "Name" :  "string", "flag" : "string", "segment1" : np.int16,
    "segment1_timepoint" : np.int16, "segment2_name" : "string", "flag_seg1" : "string", "segment2" : np.int16,
    "segment2_timepoint" : np.int16, "segment2_name" : "string", "flag_seg2" : "string", "coms_distance" : np.float32,
    "surface_distance" : np.float32, "entanglement" : np.float32, "volume" : np.float32,
    "area" : np.float32, "sphericity_bribiesca" : np.float32, "sphericity" : np.float32, "radius_of_gyration" : np.float32,
    "point_cloud_ellipticity" : np.float32, "point_cloud_eccentricity" : np.float32, "mvee_ellipticity" : np.float32,
    "mvee_eccentricity" : np.float32, "solidity" : np.float32, "holeless_solidity" : np.float32,
    "numerosity" : np.uint32, "compaction" : np.float32, "density" : np.float32, "density_kb" : np.float32
}

# We create a color map that goes from white to red for the HiC plots (put it in CIMA)
if "bright_red" not in colormaps:
    colormaps.register(name="bright_red", cmap=LinearSegmentedColormap.from_list("bright_red", [(1,1,1),(1,0,0)]))
# Function to batch process morphological features
def Batch_process_morphological_features(obj_list: list,
                        dataframe_genomic_info: pd.DataFrame,
                        size_column: str = "Size(kb)",
                        stepname_column: str = "timepoint",
                        library_n_Hype: int = 16, 
                        metaInfo: list[str] = ["experimentID","nucleus","homolog"],
                        features2skip: list[str] = ["timepoint", "numerosity", "sphericity_bribiesca"],
                        prec_mean: float = 50.0, 
                        ncores: int = 1) -> pd.DataFrame:
    
    """
    Processes a list of objects to get the morphological features of each timepoint/genomic segment.
    
    Parameters
    ----------
    obj_list : list
        List of objects to process.
    dataframe_genomic_info : pd.DataFrame
        DataFrame containing genomic information.
    size_column : str, optional
        Name of the column in dataframe_genomic_info that contains the size information (default is "Size(kb)").
    stepname_column : str, optional
        Name of the column in dataframe_genomic_info that contains the step names (default is "timepoint").
    library_n_Hype : int, optional
        Maximum number of segments allowed (default is 16).
    metaInfo : list[str], optional
        List of metadata keys to extract from each object (default is ["experimentID","nucleus","homolog"]).
    features2skip : list[str], optional
        List of feature names to skip (default is ["timepoint", "numerosity", "sphericity_bribiesca"]). Timepoint is skipped to avoid having to columns with the same name.
    prec_mean : float, optional
        Precision mean value (default is 50.0).
    ncores : int, optional
        Number of cores to use for parallel processing (default is 1).

    Returns
    -------
    pd.DataFrame
        DataFrame containing the morphological features for each timepoint/genomic segment.
    """
    
    import warnings
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    # This is done to avoid a RuntimeWarning that happens in the GetEccentricity function. I reactivate it at the end of the loop.
    
    print(f"Calculating morphological features for {len(obj_list)} objects...")
    
    info_df_list=[]
    
    if "timepoint" not in features2skip:
        features2skip.append("timepoint")
    
    for obj_counter, obj in enumerate(obj_list, start = 1):

        experiment_id = obj.metadata[metaInfo[0]]
        nucleus_ID = obj.metadata[metaInfo[1]]
        homolog = obj.metadata[metaInfo[2]]
        nSteps=obj.no_of_Times()
        
        print(f"\033[1;4m\tProcessing file experimentID: {experiment_id}, nucleusID: {nucleus_ID}, homolog: {homolog} ({obj_counter}/{len(obj_list)}).\033[0m")
        
        if nSteps>library_n_Hype:
            print(f"\tWarning too many segment {nSteps} in experimentID: {experiment_id}, nucleusID: {nucleus_ID}, homolog: {homolog}")  
            pass
        else:

            list_CS=obj.split_into_time(return_list = True)

            dataframe_features=getSegmentsFeatures(list_CS, prec_mean=prec_mean, verbose=False, n_jobs = ncores)
            
            info_from_walk=pd.DataFrame({
                "timepoint" : [cs.atomList.timepoint.unique()[0] for cs in list_CS],
                "experimentID" : [experiment_id] * len(list_CS),
                "nucleusID" : [nucleus_ID] * len(list_CS),
                "homolog" : [homolog] * len(list_CS)})
            
            features_dataframe=info_from_walk.merge(dataframe_genomic_info, left_on="timepoint", right_on = "timepoint").merge(dataframe_features,left_on="timepoint", right_on="timepoint")

            features_dataframe["flag"] = features_dataframe[["experimentID","nucleusID","homolog","timepoint"]].astype(str).apply('_'.join, axis=1)

            features_dataframe["compaction"] = features_dataframe[size_column] * 1000 / features_dataframe["volume"]
            features_dataframe["density"] = features_dataframe["numerosity"] / features_dataframe["volume"]
            features_dataframe["density_kb"] = features_dataframe["numerosity"] / features_dataframe[size_column] * 1000
            
            features_dataframe = features_dataframe[["experimentID", "nucleusID", "homolog", "timepoint", stepname_column, "flag"] +
                                                    [col for col in dataframe_features.columns if col not in features2skip] +
                                                    ["compaction", "density", "density_kb"]]
            
            info_df_list.append(features_dataframe)

    print("Creating the final DataFrame with all the morphological features for the different timepoints across experiments")
    dataframe_final_morph = pd.concat(info_df_list, ignore_index=True)
    
    warnings.filterwarnings("default", category=RuntimeWarning)
    
    print("Morphological features calculation completed.")
    
    return dataframe_final_morph

Setting some variables

In the following cell you should set up some variables for the script to run properly. Change the variables at your convenience, and then you can press “Run all” in the Jupyter Notebook.

Brief explanation of the different variables:

  • work_folder: this variable is where the plots and the tsv files that the script creates will be stored.

  • data_file: this variable is where the csv files to be assessed are stored.

  • info_file: file that contains information about your walks, such as the chromosome, start and end of the steps. You need a columns that matches the 'timepoint' column in the csv files, to be able to show proper information.

  • column_round: the column that matches the 'timepoint' column in the csv files.

  • column_name: the column with the "nice" name for the steps. This will be used instead of the timepoint number. Use "" in case the information file does not have this column (in this case the name of the steps will be in the form of 'StepN' where N is an incremental number form 1 to the max number of steps).

  • column_size: the column for the size of the segment. Use "" in case the information file does not have this column (in this case the notebook will calculate the size automatically of the segments according to the start and end positions in the file).

  • column_start_pos: the column of the start position of the timepoint.

  • column_end_pos: the column of the start position of the timepoint.

  • number_of_steps: the number of steps in your library.

  • starting_step: the step at which analysis should start.

  • prec_mean_dataset: the worst precision of the three axes (usually the Z). This will be used as a blurring factor when creating the 3D maps to calculate the volumetric features.

  • save_plots: save the plots into a folder or show them in the notebook.

  • file_suffix: add a suffix to the different files created, in case you wish to save the results from different assessments in the same folder. It will be added as "_suffix" before the .tsv/.pdf.

The notebook will create a folder in the work_folder path called 6_morphological_features_extraction. Inside this folder 2 subfolders will be created, one for tsvs and one for plots. Inside each folder, a sub-subfolder with the suffix name will have the plots/tsvs for that experiment.

6_morphological_features_extraction/
|-- tsvs/
|   |-- example_tsv_SUFFIX.tsv
|-- plots/
|   |-- example_plot_SUFFIX.pdf

Back to the general index

work_folder = "/scratch/CIMA_tutorial/"
data_folder = "/scratch/CIMA_tutorial/data/chr3"

info_file = "/scratch/CIMA_tutorial/data/chr3/Info_chr3.txt"
column_round = "Round"
column_name = "Name"
column_size = "Size(kb)"
column_chr = "Chr"
column_start_pos = "Start(hg19)"
column_end_pos = "End(hg19)"

number_of_steps_library = 16
starting_step = 6

prec_mean_dataset = 30

save_plots = False

file_suffix = "chr3"
work_folder = Path(work_folder)
data_folder = Path(data_folder)
pre_assessment_folder = Path(work_folder, "2_experimental_quality_assessment")
reconstruction_tsv_folder = Path(work_folder, "3_3D_reconstruction_assessment", "tsvs", file_suffix)

info_df = pd.read_table(Path(info_file), sep="\t")
info_df.columns = info_df.columns.str.strip() # In case there are leading or trailing spaces in the column names

info_file_dict = {
    "column_round" : column_round,
    "column_name" : column_name,
    "column_size" : column_size,
    "column_chr" : column_chr,
    "column_start_pos" : column_start_pos,
    "column_end_pos" : column_end_pos,
}

del column_round, column_name, column_size, column_chr, column_start_pos, column_end_pos, info_file

Exporting the data with CIMA

Content:

Back to the general index

Creating the list of CIMA StructuralObject

In this cell we create a list of StructuralObjects (the main CIMA class type).
This list is the basis to do the assessment through this Jupyter Notebook.

When loading the data, we will also take out those traces or CS that did not pass the assessment in the previous notebooks.

When reading the file, it will also add some metadata gathered from the filename of the csv file:

  • nucleusID: searches for "nuc" (with any combination of lower and upper case) followed by digits. If there is not a match, the default is filecount (first file read will be number 1, ...).

  • cellID: searches for "cell" (with any combination of lower and upper case) followed by digits. If there is not a match, the default is filecount (first file read will be number 1, ...).

  • locationID: searches for "loc" (with any combination of lower and upper case) followed by optional hyphen and digits.If there is not a match, the default is filecount (first file read will be number 1, ...).

  • date: searches for dates in the format YYYY-MM-DD or YYYY.MM.DD. If there is not a match, the default is the file name.

  • homolog: searches for "_" followed by A, B, M, P, m, p, 0 or 1 (only one instance of this). If there is not a match, the default is "A".

  • chromosome: searches for "chr" (with any combination of lower and upper case) followed by a number, M, X or Y. If there is not a match, the default is "chrN".

Back to the section index

# Get a list of all CSV files in the specified data folder
csv_files = list(data_folder.glob('*.csv'))

if len(csv_files) == 0:
    print("No files found, please input the correct work_folder")
else:
    print(f"Total number of files found: {len(csv_files)}")

print("Loading the experimental quality data from the CSV files")
traces_pass = pd.read_csv(Path(pre_assessment_folder, f"experimental_assessment_pass_{file_suffix}.txt"), sep="\t", names=["trace"])

print("Loading the bad flags from the CSV files")
segments_to_remove_CCC = pd.read_csv(Path(reconstruction_tsv_folder, f'segments2remove_bycccvariation_t2{("_" + file_suffix) if file_suffix != "" else ""}.tsv'), sep="\t")
if "flag" not in segments_to_remove_CCC.columns and not segments_to_remove_CCC.empty:
    print("\tCreating the 'flag' column for reconstruction-based removal")
    segments_to_remove_CCC["flag"] = segments_to_remove_CCC[["experimentID", "nucleusID", "homolog", "timepoint"]].astype(str).apply("_".join, axis=1)
    bad_flags = set(segments_to_remove_CCC["flag"].astype(str).values)
elif not segments_to_remove_CCC.empty:
    bad_flags = set(segments_to_remove_CCC["flag"].astype(str).values)
else:
    bad_flags = set()

# Get a list of metadata keys to extract from filenames
metadata_keys = ["nucleusID", "cellID", "locationID", "date", "homolog", "chromosome"]

# Initialize an empty list to store the StructuralObject instances
obj_list = []

if len(csv_files) > 0:
    # Iterate over each file in the list of file names
    for count, filein in enumerate(csv_files, start=1):

        stem = Path(filein).stem

        if not stem in traces_pass["trace"].values:
            print(f"Skipping file {count}: {stem} because it did not pass the experimental quality assessment.")
            continue

        print(f"Processing file {count}: {stem}", end="\r")

        # We mine the metadata from the filename using regex patterns. If not possible, assign default values.
        # In the case of nucleus, cell and location, the default is the filecount (first file read will be number 1, ...)
        # In the case of date, if no date can be found, the filename gets used.
        # If no valid homolog denomination is found, the default value is A.
        # If no valid chromosome denomination is found, the default value is chrN.
        metadata_CIMA = {
            key: (match.group(1) if (match := regex_patterns[key].search(stem)) else default)
            for key, default in zip(metadata_keys, [count, count, count, stem, "A", "chrN"])
        }

        # Adjust the metadata for nucleus and cell (if nucleus is count, but cell is something, use the cell match in nucleus)
        if "cellID" in metadata_CIMA and metadata_CIMA["nucleusID"] == count:
            metadata_CIMA["nucleusID"] = metadata_CIMA["cellID"]

        # Set experimentID as date for consistency
        metadata_CIMA["experimentID"] = metadata_CIMA["date"]

        # Read the CSV file and create a StructuralObject instance
        objin = CSVParser.read_CSV_file(filein.as_posix(), metadata = metadata_CIMA, content_type = "srx")

        # Filter the atomList to include only the specified timepoints in the info file
        objin.atomList = objin.atomList[(objin.atomList['timepoint'] >= starting_step - 1) &
                                        (objin.atomList['timepoint'] < (starting_step + number_of_steps_library - 1))].copy()
        
        experiment_id = objin.metadata["experimentID"]
        nucleus_id = int(objin.metadata["nucleusID"])
        homolog = objin.metadata["homolog"]
        
        full_flags = (experiment_id + "_" + str(nucleus_id) + "_" +homolog + "_" + objin.atomList["timepoint"].astype(str))
        remove_mask = full_flags.isin(bad_flags)

        if remove_mask.any():
            removed_tps = objin.atomList.loc[remove_mask, "timepoint"].unique()
            # removed_tps = ",".join([tp_names[tp] for tp in removed_tps])
            # print(f"Excluding timepoints {removed_tps} in experimentID: {experiment_id}, nucleusID: {nucleus_id}, homolog: {homolog} due to poor assessment.")
            objin.atomList = objin.atomList.loc[~remove_mask].copy()

        for tp in objin.atomList["timepoint"].unique():
            # Take out the timepoint from the atomList if it has less tha 10 points
            tp_mask = objin.atomList["timepoint"] == tp
            if tp_mask.sum() < 10:
                print(f"\tExcluding timepoint {tp} in experimentID: {experiment_id}, nucleusID: {nucleus_id}, homolog: {homolog} because it has less than 10 points.")
                objin.atomList = objin.atomList.loc[~tp_mask].copy()
        
        # Append the object to the list
        obj_list.append(objin)

    # Nice print to tell the user the number of objects created.
    print(f"\nTotal number of objects created: {len(obj_list)}")

    # Clean up memory by deleting some of the variables no longer needed
    del count, filein, stem, metadata_CIMA, match, objin

del csv_files, metadata_keys
del bad_flags
del experiment_id, nucleus_id, homolog, full_flags, remove_mask
if 'segments_to_remove_precision' in locals():
    del segments_to_remove_CCC
Total number of files found: 33
Loading the experimental quality data from the CSV files
Loading the bad flags from the CSV files
	Creating the 'flag' column for reconstruction-based removal
Skipping file 2: 2018-07-10_nuc04_A_chr3 because it did not pass the experimental quality assessment.
Skipping file 5: 2018-07-10_nuc03_A_chr3 because it did not pass the experimental quality assessment.
	Excluding timepoint 19 in experimentID: 2018-09-04, nucleusID: 7, homolog: A because it has less than 10 points.
Skipping file 26: 2018-09-04_nuc01_A_chr3 because it did not pass the experimental quality assessment.
Processing file 33: 2018-09-04_nuc06_A_chr3
Total number of objects created: 30

Checking the information file

This cells makes an adjustment in the info_df.

It checks if the column of the info_df that we will use as the link between this data frame and our walks has the same timepoints. This happens because most programs start with timepoint 0, while most information files start at timepoint 1.

This will create a new column in the dataframe that is called ‘timepoint’. This column will be modified so it has the same range as the timepoint column from the csv files.

Back to the section index

list_of_timepoints = sorted({int(tp) for obj in obj_list for tp in obj.atomList.timepoint.unique()})
info_of_timepoints = list(map(int, info_df[info_file_dict["column_round"]].tolist()))

print(f'List of timepoints found across all objects: {list_of_timepoints}')
print(f'List of timepoints in the information file: {info_of_timepoints}\n')

if list_of_timepoints == info_of_timepoints:
    print('The timepoints found in the objects match those in the information file.')
    info_df["timepoint"] = info_df[info_file_dict["column_round"]]
elif set(list_of_timepoints).issubset(info_of_timepoints):
    print('Note: The timepoints found in the objects are a subset of those in the information file. Not all timepoints are present.')
    info_df["timepoint"] = info_df[info_file_dict["column_round"]]
elif list_of_timepoints == [x - 1 for x in info_of_timepoints]:
    print('Note: The column for the information file that has the timepoint is offset by 1 compared to the objects, adjusting accordingly.')
    info_df["timepoint"] = info_df[info_file_dict["column_round"]] - 1
elif set(list_of_timepoints).issubset({x - 1 for x in info_of_timepoints}):
    print('Note: The timepoints found in the objects are a subset of those in the information file, which are offset by 1 compared to the objects. Adjusting accordingly.')
    info_df["timepoint"] = info_df[info_file_dict["column_round"]] - 1
else:
    print('Warning: The timepoints found in the objects do not match those in the information file.')

if info_file_dict["column_size"] == "":
    print("Calculating size column from start and end positions...")
    column_size = "size"
    info_file_dict["column_size"] = column_size
    info_df[column_size] = info_df[info_file_dict["column_end_pos"]] - info_df[info_file_dict["column_start_pos"]]

if info_file_dict["column_name"] == "":
    print("Setting name column from round numbers...")
    info_file_dict["column_name"] = "name"
    info_df["name"] = [f"Step{i}" for i in range(1, info_df.shape[0]+1)]

tp_names = info_df.set_index("timepoint")[info_file_dict["column_name"]].to_dict()
    
del list_of_timepoints, info_of_timepoints
List of timepoints found across all objects: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
List of timepoints in the information file: [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

Note: The column for the information file that has the timepoint is offset by 1 compared to the objects, adjusting accordingly.

Creating a color palette and the results folders for the tsv and plot files

Here we create a color palette automatically based on the different experiment ID found across the files. Experiment IDs are usually the date of the experiment. This allows to easily check for a possible batch effect.

We also create the folders were to store the results.

  • 6_morphological_features_extraction/tsvs: a subfolder using the suffix name will be created, where the different tsv files will be stored.

  • 6_morphological_features_extraction/plots: a subfolder using the suffix name will be created, where the different plots (in pdf format) will be stored.

Back to the section index

# Define the color palette for experiments. Use the date or the experimentID (date) as the key.
color_palette = {exp_date: color for exp_date, color in zip(
    # We get a set of unique dates from the metadata of the objects
    set([file.metadata["experimentID"] for file in obj_list]),
    # We assign a color from the tab10 palette for each unique date
    sns.color_palette("tab10", n_colors=len(set([file.metadata["experimentID"] for file in obj_list]))) 
)}

# This palette is fixed for the current experiments used in the tutorial, which aim to reproduce the figures in the CIMA paper.
color_palette = {'2018-01-11':"#a6cee3" ,'2018-06-28':'#1f78b4', '2018-07-10':'#b2df8a', '2018-09-04':'#33a02c'}

# Create the folders to save the results. If the folders already exist, do not raise an error.
tsv_folder = Path(work_folder, "6_morphological_features_extraction", "tsvs", file_suffix)
tsv_folder.mkdir(parents=True, exist_ok=True)

plots_folder = Path(work_folder, "6_morphological_features_extraction", "plots", file_suffix)
plots_folder.mkdir(parents=True, exist_ok=True)

Extracting the morphological features

Content:

Back to the general index

3D features calculation/loading

In this cell we check if the the morphological info has been calculated/loaded.
If it is not loaded (or it is not a pandas DataFrame), we try to load it from a file.
In case it said file does not exists it is calcualted, poorly assesed segments filtered out and saved.

Back to the section index

if "features_3d_df" not in globals() or not isinstance(features_3d_df, pd.DataFrame):
    
    filename = f"morphological_features_{prec_mean_dataset}{('_' + file_suffix if file_suffix != '' else '')}.tsv"
    
    print("Trying to read pre-calculated morphological features from TSV file...")
    try:
        features_3d_df = pd.read_csv(Path(tsv_folder, filename), sep='\t')

        if "flag" not in features_3d_df.columns:
            features_3d_df["flag"] = features_3d_df[["experimentID","nucleusID","homolog","timepoint"]].astype(str).apply('_'.join, axis=1)

        print("Pre-calculated morphological features loaded from TSV file.")

    except FileNotFoundError:

        print("Pre-calculated TSV file not found. Calculating morphological features from scratch...")
        features_3d_df = Batch_process_morphological_features(
            obj_list = obj_list, 
            dataframe_genomic_info = info_df,
            size_column = info_file_dict["column_size"],
            stepname_column=info_file_dict["column_name"],
            library_n_Hype = number_of_steps_library,
            metaInfo = ["experimentID","nucleusID","homolog"],
            features2skip = [],
            prec_mean = prec_mean_dataset,
            ncores = 1
        )

        print("Saving morphological features to TSV file...", end="\r")
        features_3d_df.drop(columns=["flag"]).to_csv(Path(tsv_folder, filename), sep='\t', index=False)
        print("Saving morphological features to TSV file... Done!")

        print("Morphological features calculated and saved to TSV file.")
    
    del filename

else:
    print("Morphological features DataFrame already exists in memory.")

if "flag2" not in features_3d_df.columns:
    features_3d_df["flag2"] = features_3d_df[["experimentID","nucleusID","homolog"]].astype(str).apply('_'.join, axis=1)

features_3d_df = features_3d_df.sort_values(by="timepoint")
display(features_3d_df.head())
Trying to read pre-calculated morphological features from TSV file...
Pre-calculated TSV file not found. Calculating morphological features from scratch...
Calculating morphological features for 30 objects...
	Processing file experimentID: 2018-09-04, nucleusID: 03, homolog: A (1/30).
	Processing file experimentID: 2018-07-10, nucleusID: 02, homolog: A (2/30).
	Processing file experimentID: 2018-07-10, nucleusID: 05, homolog: A (3/30).
	Processing file experimentID: 2018-06-28, nucleusID: 07, homolog: A (4/30).
	Processing file experimentID: 2018-06-28, nucleusID: 09, homolog: A (5/30).
	Processing file experimentID: 2018-07-10, nucleusID: 11, homolog: A (6/30).
	Processing file experimentID: 2018-09-04, nucleusID: 01, homolog: B (7/30).
	Processing file experimentID: 2018-07-10, nucleusID: 06, homolog: B (8/30).
	Processing file experimentID: 2018-06-28, nucleusID: 05, homolog: B (9/30).
	Processing file experimentID: 2018-07-10, nucleusID: 06, homolog: A (10/30).
	Processing file experimentID: 2018-06-28, nucleusID: 08, homolog: A (11/30).
	Processing file experimentID: 2018-06-28, nucleusID: 05, homolog: A (12/30).
	Processing file experimentID: 2018-07-10, nucleusID: 07, homolog: A (13/30).
	Processing file experimentID: 2018-07-10, nucleusID: 01, homolog: A (14/30).
	Processing file experimentID: 2018-07-10, nucleusID: 02, homolog: B (15/30).
	Processing file experimentID: 2018-06-28, nucleusID: 01, homolog: A (16/30).
	Processing file experimentID: 2018-09-04, nucleusID: 04, homolog: A (17/30).
	Processing file experimentID: 2018-07-10, nucleusID: 04, homolog: B (18/30).
	Processing file experimentID: 2018-09-04, nucleusID: 03, homolog: B (19/30).
	Processing file experimentID: 2018-07-10, nucleusID: 03, homolog: B (20/30).
	Processing file experimentID: 2018-07-10, nucleusID: 01, homolog: B (21/30).
	Processing file experimentID: 2018-09-04, nucleusID: 07, homolog: A (22/30).
	Processing file experimentID: 2018-06-28, nucleusID: 02, homolog: A (23/30).
	Processing file experimentID: 2018-07-10, nucleusID: 10, homolog: A (24/30).
	Processing file experimentID: 2018-07-10, nucleusID: 08, homolog: A (25/30).
	Processing file experimentID: 2018-09-04, nucleusID: 05, homolog: A (26/30).
	Processing file experimentID: 2018-06-28, nucleusID: 01, homolog: B (27/30).
	Processing file experimentID: 2018-09-04, nucleusID: 10, homolog: A (28/30).
	Processing file experimentID: 2018-06-28, nucleusID: 07, homolog: B (29/30).
	Processing file experimentID: 2018-09-04, nucleusID: 06, homolog: A (30/30).
Creating the final DataFrame with all the morphological features for the different timepoints across experiments
Morphological features calculation completed.
Saving morphological features to TSV file... Done!
Morphological features calculated and saved to TSV file.
experimentID nucleusID homolog timepoint Name flag volume area sphericity_bribiesca sphericity ... point_cloud_eccentricity mvee_ellipticity mvee_eccentricity solidity holeless_solidity numerosity compaction density density_kb flag2
73 2018-07-10 11 A 5 Step1 2018-07-10_11_A_5 45549000.0 1.334969e+06 0.849249 0.462025 ... 0.631109 0.811229 0.681738 0.147928 0.147928 4029.0 0.010977 0.000088 8058.0 2018-07-10_11_A
88 2018-09-04 01 B 5 Step1 2018-09-04_01_B_5 56322000.0 1.477865e+06 0.875486 0.480804 ... 0.223474 0.786694 0.577798 0.338188 0.338188 7982.0 0.008878 0.000142 15964.0 2018-09-04_01_B
363 2018-07-10 08 A 5 Step1 2018-07-10_08_A_5 24165000.0 7.481200e+05 0.859796 0.540303 ... 0.287081 0.810575 0.723042 0.252932 0.252932 4138.0 0.020691 0.000171 8276.0 2018-07-10_08_A
287 2018-07-10 03 B 5 Step1 2018-07-10_03_B_5 85833000.0 2.267695e+06 0.864498 0.414956 ... 0.274333 0.679143 0.652617 0.218693 0.218693 8507.0 0.005825 0.000099 17014.0 2018-07-10_03_B
256 2018-07-10 04 B 5 Step1 2018-07-10_04_B_5 29268000.0 9.034645e+05 0.854050 0.508357 ... 0.205049 0.699012 0.601044 0.361997 0.361997 2320.0 0.017084 0.000079 4640.0 2018-07-10_04_B

5 rows × 22 columns

Plot 3D feature per segment

Boxplots of the different morphological features calculated, separated by chromatin segment

Back to the section index

labels = features_3d_df[info_file_dict["column_name"]].unique().astype(str).tolist()

features_3d_df = features_3d_df[features_3d_df["point_cloud_ellipticity"] <= 0.8].copy()

for feature in ["volume", "area", "density_kb",
                "sphericity_bribiesca", "sphericity", "radius_of_gyration",
                "point_cloud_ellipticity",  "solidity","compaction", "numerosity"]:
    
    print(feature.capitalize())
    
    plt.figure(figsize=(10,3))
    sns.boxplot(x = features_3d_df[info_file_dict["column_name"]],
                y = features_3d_df[feature],
                hue = features_3d_df[info_file_dict["column_name"]], palette="turbo")
    sns.stripplot(x = features_3d_df[info_file_dict["column_name"]], y = features_3d_df[feature],
                  color='grey', alpha = 0.5)

    plt.ylabel(plot_labels[feature])
    plt.xlabel("Walk segment #ID")
    plt.ticklabel_format(style='sci',axis="y",scilimits=(0,0))
    plt.xticks(ticks=range(len(labels)), labels=labels, rotation=45, ha="right")    

    sns.despine()
    
    if save_plots:
        filename = f'boxplot_{feature}_segment' + (f"_{file_suffix}" if file_suffix != '' else '') + '.pdf'
        plot_filename = Path(plots_folder, filename)
        print(f"Saving plots to: {plot_filename}")
        plt.savefig(plot_filename, **plot_opts)
        del filename, plot_filename
    else:
        plt.show()
    plt.close()
    
    stats_df=[]
    all_val = [float(x) for x in features_3d_df[feature].tolist()]
    grps = features_3d_df[info_file_dict["column_name"]].unique()
    d_data = {grp:features_3d_df[features_3d_df[info_file_dict["column_name"]] == grp][feature].tolist() for grp in grps}
            
    for grp in grps:
        T,p=mannwhitneyu(all_val,d_data[grp])
        stats_df.append( [grp,f"{p:.2f}",statConvert(f"{p:.2f}")])
        
    stats_df=pd.DataFrame(stats_df,
                          columns=["Segment name","Mann-Whitney U p-value","converted p-value"])
    
    print("p-values of Mann-Whitney U test comparing each segment against all the others:")
    display(stats_df)
    print()
    
del feature, stats_df, all_val, grps, d_data, grp, T, p, labels
Volume
../_images/135c0d8eb8092d6734d4e6116b7c8d00aafbe5189e8893708aa1434a5835723a.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.08 8.00E-2
1 Step2 0.00 ****
2 Step3 0.79 7.90E-1
3 Step4 0.08 8.00E-2
4 Step5 0.26 2.60E-1
5 Step6 0.13 1.30E-1
6 Step7 0.01 **
7 Step8 0.00 ****
8 Step9 0.45 4.50E-1
9 Step10 0.53 5.30E-1
10 Step11 0.08 8.00E-2
11 Step12 0.32 3.20E-1
12 Step13 0.35 3.50E-1
13 Step14 0.44 4.40E-1
14 Step15 0.72 7.20E-1
15 Step16 0.00 ****
Area
../_images/802a8c37311b9fec9d01538219170c3b49f924f27bbac40df94d450f3dcc8934.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.18 1.80E-1
1 Step2 0.00 ****
2 Step3 0.87 8.70E-1
3 Step4 0.10 1.00E-1
4 Step5 0.48 4.80E-1
5 Step6 0.24 2.40E-1
6 Step7 0.10 1.00E-1
7 Step8 0.00 ****
8 Step9 0.56 5.60E-1
9 Step10 0.64 6.40E-1
10 Step11 0.30 3.00E-1
11 Step12 0.50 5.00E-1
12 Step13 0.66 6.60E-1
13 Step14 0.42 4.20E-1
14 Step15 0.99 9.90E-1
15 Step16 0.00 ****
Density_kb
../_images/50f3e696a8e9b3f158656dbfdf90be2a52364fe0ea937948b78eea738ab966d1.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.59 5.90E-1
1 Step2 0.00 ****
2 Step3 0.10 1.00E-1
3 Step4 0.00 ****
4 Step5 0.01 **
5 Step6 0.00 ****
6 Step7 0.00 ****
7 Step8 0.01 **
8 Step9 0.25 2.50E-1
9 Step10 0.19 1.90E-1
10 Step11 0.00 ****
11 Step12 0.09 9.00E-2
12 Step13 0.16 1.60E-1
13 Step14 0.14 1.40E-1
14 Step15 0.52 5.20E-1
15 Step16 0.00 ****
Sphericity_bribiesca
../_images/a7077ead05df9f193bf49d2e58e3605e8f8432651de3c05526793398c20404fb.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.19 1.90E-1
1 Step2 0.00 ****
2 Step3 0.96 9.60E-1
3 Step4 0.79 7.90E-1
4 Step5 0.33 3.30E-1
5 Step6 0.16 1.60E-1
6 Step7 0.01 **
7 Step8 0.14 1.40E-1
8 Step9 0.14 1.40E-1
9 Step10 0.64 6.40E-1
10 Step11 0.02 *
11 Step12 0.22 2.20E-1
12 Step13 0.17 1.70E-1
13 Step14 0.75 7.50E-1
14 Step15 0.09 9.00E-2
15 Step16 0.40 4.00E-1
Sphericity
../_images/b7fe5d20f1f3ce7dc8731644a6d198c733ede3a1b9f0ff05ee9540fd7e5861ce.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.73 7.30E-1
1 Step2 0.87 8.70E-1
2 Step3 0.83 8.30E-1
3 Step4 0.30 3.00E-1
4 Step5 0.79 7.90E-1
5 Step6 0.91 9.10E-1
6 Step7 0.70 7.00E-1
7 Step8 0.00 ****
8 Step9 0.86 8.60E-1
9 Step10 0.94 9.40E-1
10 Step11 0.43 4.30E-1
11 Step12 0.74 7.40E-1
12 Step13 0.44 4.40E-1
13 Step14 0.51 5.10E-1
14 Step15 0.31 3.10E-1
15 Step16 0.00 ****
Radius_of_gyration
../_images/94b478c1d1ad86d37f95eac971c20d1187d196efd168e951d3049f70960ae889.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.31 3.10E-1
1 Step2 0.00 ****
2 Step3 0.03 *
3 Step4 0.03 *
4 Step5 0.72 7.20E-1
5 Step6 0.27 2.70E-1
6 Step7 0.43 4.30E-1
7 Step8 0.00 ****
8 Step9 0.93 9.30E-1
9 Step10 0.16 1.60E-1
10 Step11 0.52 5.20E-1
11 Step12 0.83 8.30E-1
12 Step13 0.99 9.90E-1
13 Step14 0.81 8.10E-1
14 Step15 0.04 *
15 Step16 0.00 ****
Point_cloud_ellipticity
../_images/6fbf8d6003ee44d5a0feff538e33634cb330da5af455d2b705a30908ed9fdb35.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.30 3.00E-1
1 Step2 0.35 3.50E-1
2 Step3 0.09 9.00E-2
3 Step4 0.46 4.60E-1
4 Step5 0.05 *
5 Step6 0.98 9.80E-1
6 Step7 0.49 4.90E-1
7 Step8 0.67 6.70E-1
8 Step9 0.67 6.70E-1
9 Step10 0.15 1.50E-1
10 Step11 0.75 7.50E-1
11 Step12 0.65 6.50E-1
12 Step13 0.53 5.30E-1
13 Step14 0.13 1.30E-1
14 Step15 0.33 3.30E-1
15 Step16 0.74 7.40E-1
Solidity
../_images/befab189700d65bdb10bd3c8e1d425d9d760991015fc2a3c65aaec2dff305163.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.51 5.10E-1
1 Step2 0.06 6.00E-2
2 Step3 0.06 6.00E-2
3 Step4 0.26 2.60E-1
4 Step5 0.68 6.80E-1
5 Step6 0.53 5.30E-1
6 Step7 0.95 9.50E-1
7 Step8 0.00 ****
8 Step9 0.68 6.80E-1
9 Step10 0.73 7.30E-1
10 Step11 0.82 8.20E-1
11 Step12 0.96 9.60E-1
12 Step13 0.94 9.40E-1
13 Step14 0.56 5.60E-1
14 Step15 0.15 1.50E-1
15 Step16 0.00 ****
Compaction
../_images/0bb5e6f627a09474ff3920b8a4cb90dfc905ae7a212cce8bdf4bd1649f888813.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.08 8.00E-2
1 Step2 0.00 ****
2 Step3 0.79 7.90E-1
3 Step4 0.08 8.00E-2
4 Step5 0.26 2.60E-1
5 Step6 0.13 1.30E-1
6 Step7 0.01 **
7 Step8 0.00 ****
8 Step9 0.45 4.50E-1
9 Step10 0.53 5.30E-1
10 Step11 0.08 8.00E-2
11 Step12 0.32 3.20E-1
12 Step13 0.35 3.50E-1
13 Step14 0.44 4.40E-1
14 Step15 0.72 7.20E-1
15 Step16 0.00 ****
Numerosity
../_images/45cc7c470753cac46ce897974956a94cc39f13ffac93b6bc8bede47295f19f91.png
p-values of Mann-Whitney U test comparing each segment against all the others:
Segment name Mann-Whitney U p-value converted p-value
0 Step1 0.59 5.90E-1
1 Step2 0.00 ****
2 Step3 0.10 1.00E-1
3 Step4 0.00 ****
4 Step5 0.01 **
5 Step6 0.00 ****
6 Step7 0.00 ****
7 Step8 0.01 **
8 Step9 0.25 2.50E-1
9 Step10 0.19 1.90E-1
10 Step11 0.00 ****
11 Step12 0.09 9.00E-2
12 Step13 0.16 1.60E-1
13 Step14 0.14 1.40E-1
14 Step15 0.52 5.20E-1
15 Step16 0.00 ****

feature2plot = "point_cloud_ellipticity"

mean_feature = features_3d_df[feature2plot].mean()
std_feature = features_3d_df[feature2plot].std()

g = sns.relplot(
        data=features_3d_df,
        x="timepoint",
        y=feature2plot,
        col="flag2",
        col_wrap=6,          # adjust as needed
        kind="line",
        marker="o",
        linewidth=1.2,
        height=2.5,
        aspect=1.2,
        facet_kws={"sharey": True, "sharex": True},
        # ylim = (0, 1)
    )

g.set_axis_labels("Timepoint (tp)", f"{feature2plot.replace('_', ' ').title()} (blur {prec_mean_dataset})")
g.set_titles("{col_name}")
g.set(ylim=(0,1))

for ax in g.axes.flat:
    x_min = int(features_3d_df["timepoint"].min())
    x_max = int(features_3d_df["timepoint"].max())
    ax.set_xticks(range(x_min, x_max + 1))
    ax.set_xticklabels(range(1, number_of_steps_library + 1))
    ax.grid(False)
    ax.axhline(mean_feature, color = "red", alpha = 0.25, zorder = 0)
    ax.axhline(mean_feature+std_feature, color = "black", alpha = 0.25, zorder = 0, linestyle="--")
    ax.axhline(mean_feature-std_feature, color = "black", alpha = 0.25, zorder = 0, linestyle="--")


plt.tight_layout()
if save_plots:
    filename = f'lineplot_{feature2plot}_pertrace' + (f"_{file_suffix}" if file_suffix != '' else '') + '.pdf'
    plot_filename = Path(plots_folder, filename)
    print(f"Saving plots to: {plot_filename}")
    plt.savefig(plot_filename, **plot_opts)
    del filename, plot_filename
else:
    plt.show()
plt.close()
plt.close()

del feature2plot, mean_feature, std_feature, g, ax, x_min, x_max
../_images/bbf27421854101a72a669ebb314b76c1533137cc8f928e008657be860472442b.png

Distribution plot of compaction

Plot of the distribution for the compaction (bp/nm³) across the different timepoints and walks

Back to the section index

sns.displot(x=features_3d_df["compaction"], kind="kde", fill=True)
plt.xlabel(plot_labels["compaction"])
plt.ylabel("Density")
plt.title("Distribution of Compaction")
sns.despine()
if save_plots:
    filename = f'distribution_of_compaction' + (f"_{file_suffix}" if file_suffix != '' else '') + '.pdf'
    plot_filename = Path(plots_folder, filename)
    print(f"Saving plots to: {plot_filename}")
    plt.savefig(plot_filename, **plot_opts)
    del filename, plot_filename
else:
    plt.show()
plt.close()

print(f"Max compaction over all the data: {np.max(features_3d_df['compaction'])}")
print(f"Min compaction over all the data: {np.min(features_3d_df['compaction'])}")
print(f"Median compaction over all the data: {np.nanmedian(features_3d_df['compaction'])}")
print(f"Median absolute deviation of compaction over all the data: {median_abs_deviation(features_3d_df['compaction'])}")
../_images/b855acf7830ddf6f166574f95c82242a495ea005fa340284607e7363ee725c2c.png
Max compaction over all the data: 0.10175010175010175
Min compaction over all the data: 0.0029094294608827207
Median compaction over all the data: 0.010513170439172999
Median absolute deviation of compaction over all the data: 0.003969821687193798

Ratios of features between homologs

Here we retrieve those walks for which we have the pair of homologs, and compute the ratio of the features between them.
IMPORTANT!!! The ratio is always calculated as smaller homolog divided by bigger homolog.

Back to the section index

# TODO: to create a syntehtic dataset use Create_Random_Hom_Ratio_smalloverlarger

ratios2calculate = ['volume',
       'area', 'sphericity_bribiesca', 'sphericity', 'radius_of_gyration',
       'point_cloud_ellipticity', 'point_cloud_eccentricity',
       'mvee_ellipticity', 'mvee_eccentricity', 'solidity',
       'holeless_solidity', 'numerosity']

listratio=[]
both_homs_detected=0

for expID_nucID, group_rows in features_3d_df.groupby(["experimentID", "nucleusID"]): 
    
    if len(group_rows.homolog.unique())==2:
        
        both_homs_detected+=1
        homA_df = group_rows[group_rows.homolog==group_rows.homolog.unique()[0]]
        homB_df = group_rows[group_rows.homolog==group_rows.homolog.unique()[1]]
        
        for tp in group_rows.timepoint.sort_values().unique():
            homA_step=homA_df[homA_df.timepoint == tp]
            homB_step=homB_df[homB_df.timepoint == tp]
            
            if len(homA_step!=0) and len(homB_step!=0):
                homA_step = homA_step[ratios2calculate].copy()
                homA_step.reset_index(drop=True, inplace=True)
                homA_step = homA_step.astype(float)
                
                homB_step = homB_step[ratios2calculate].copy()
                homB_step.reset_index(drop=True, inplace=True)
                homB_step = homB_step.astype(float)

                temp = pd.DataFrame()

                for feature in ratios2calculate:
                    if homA_step[feature].values[0] > homB_step[feature].values[0]:
                        homA_val = homA_step[feature].values[0]
                        homB_val = homB_step[feature].values[0]
                        ratio = homB_val / homA_val if homA_val != 0 else np.nan
                    else:
                        homA_val = homA_step[feature].values[0]
                        homB_val = homB_step[feature].values[0]
                        ratio = homA_val / homB_val if homB_val != 0 else np.nan
                    temp[feature] = [ratio]
                
                temp = temp.replace([np.inf, -np.inf], np.nan)
            
                temp["experimentID"] = expID_nucID[0]
                temp["nucleusID"] = expID_nucID[1]
                temp["timepoint"] = tp
                temp["name"] = tp_names[tp]
                temp["flag"] = f"{expID_nucID[0]}_{expID_nucID[1]}_{tp}"

                listratio.append(temp)
            
print(f"Number of nuclei with both homologs detected: {both_homs_detected}")
if both_homs_detected > 0:           
    ratio_df = pd.concat(listratio, ignore_index=True)

    # Put experimentID, nucleusID, timepoint and flag columns as the first ones
    cols = ratio_df.columns.tolist()
    cols_reorder = [col for col in first_cols if col in cols] + [col for col in ratio_df.columns if col not in first_cols]
    ratio_df = ratio_df[cols_reorder]

    display(ratio_df.head())

del listratio, expID_nucID, group_rows
if both_homs_detected > 0:
    del homA_df, homB_df, tp, homA_step, homB_step, temp, cols, cols_reorder
Number of nuclei with both homologs detected: 7
experimentID nucleusID timepoint name flag volume area sphericity_bribiesca sphericity radius_of_gyration point_cloud_ellipticity point_cloud_eccentricity mvee_ellipticity mvee_eccentricity solidity holeless_solidity numerosity
0 2018-06-28 01 6 Step2 2018-06-28_01_6 0.768706 0.688118 0.972842 0.820017 0.793591 0.742057 0.521469 0.914879 0.969637 0.771493 0.771493 0.853881
1 2018-06-28 01 7 Step3 2018-06-28_01_7 0.484196 0.561475 0.984803 0.910576 0.686866 0.576570 0.651632 0.570376 0.737631 0.909381 0.909381 0.515002
2 2018-06-28 01 8 Step4 2018-06-28_01_8 0.816408 0.871688 0.993748 0.997906 0.941998 0.923149 0.267049 0.857644 0.602944 0.831735 0.831735 0.737461
3 2018-06-28 01 9 Step5 2018-06-28_01_9 0.908607 0.990657 0.982062 0.929339 0.855609 0.914165 0.696192 0.745733 0.925636 0.614877 0.614877 0.596154
4 2018-06-28 01 10 Step6 2018-06-28_01_10 0.968750 0.945935 0.994009 0.966170 0.999172 0.517633 0.865706 0.940067 0.856221 0.846370 0.846370 0.895298

Barplots for the different features across the different timepoints

Barplots of the ratios between homologs, seaparated bt chromatin segment

if both_homs_detected > 0:
    for ratio_name in ratios2calculate:

        temp = ratio_df[["timepoint", ratio_name, "name"]].copy()

        temp["name"] = pd.Categorical(temp["name"], categories=info_df[info_file_dict["column_name"]].tolist(), ordered=True)
        temp.sort_values("name", inplace=True)

        sns.boxplot(data=temp, x="timepoint", y=ratio_name, hue="timepoint", palette="tab20", legend=False)
        sns.stripplot(data=temp, x="timepoint", y=ratio_name, color="gray", dodge=True, alpha=0.5)
        plt.xticks(ticks = range(len(temp["name"].unique())), labels=temp["name"].unique(),
                rotation=45, ha="right")
        plt.xlabel("")
        plt.ylabel(f"Ratio of {' '.join(ratio_name.split('_'))} (small homolog / big homolog)")
        sns.despine()
        plt.title(f"Ratio of {' '.join(ratio_name.split('_'))} over timepoints")
        if save_plots:
            filename = f'boxplot_ratio_{ratio_name.replace(" ", "_")}_homologs' + (f"_{file_suffix}" if file_suffix != '' else '') + '.pdf'
            plot_filename = Path(plots_folder, filename)
            print(f"Saving plots to: {plot_filename}")
            plt.savefig(plot_filename, **plot_opts)
            del filename, plot_filename
        else:
            plt.show()
        plt.close()
        
    # del ratio_name, ratios2calculate
# del both_homs_detected, ratio_df
../_images/846b01c872a777f7d0a3634c336ad20833978d3ec03f0fb8f6b643503ddd6cd3.png ../_images/e34bca153168d3b2dbcf81fc7a69f14f5a2a6cda4fdd2602b20eab1879c9907a.png ../_images/266bd5f47e2211d3bc266b1eb768d9b2ed74cd407a716bed48134d745f391b68.png ../_images/bc8c2cc83da9d462cbd10db9dad563323e2a9752305f37bd500efbc2e6ebe0f6.png ../_images/5c8d43471b99a76618cf69db5230bddacdea76e106773a86706c32cd7c1b0a8b.png ../_images/68fe78ef5216228bc168e71b76896124e6b6356a0d3d65e8ab8c8a313807dd72.png ../_images/43f89ea5f330ea51bbac3f55fea670ebdb4cffd4857ab48a0dd4166be448085c.png ../_images/4b4ee8fea8edb485585643f8ade48e53baff11c1817a5b73eaa34209708a6df2.png ../_images/60200826aaeed977da24f12734b7a8ea0954fc54e003215cf8a99743728610e4.png ../_images/ee5461778a64d70fb2e1308b5146014c7492624ad8eab236e29d7c0a4fa70345.png ../_images/85f9d7042aaf80249dd5a24b183a158064c8862c0b891073b12c238ae8a96314.png ../_images/f1e7b01803495d350c5e037d10d7ec637b6944bea8d95e547b0988833fdcb520.png