Menu

Package that provides tools for brain MRI Deep Learning pre-processing.

Source code for brainprep.qc

# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2021 - 2022
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################

"""
Useful automatic quality control (QC) functions.
"""

# Imports
import os
import re
import traceback
import numpy as np
import pandas as pd
from pprint import pprint
import xml.etree.ElementTree as ET
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from .utils import get_bids_keys


def check_files(input_files):
    """ Check that all file lists are ordered the same way and follow the
    BIDS nomenclature.

    Parameters
    ----------
    input_files: list of list
        the lists of files to compare: the i-th element of every list is
        expected to describe the same participant, session and run.

    Raises
    ------
    ValueError
        if the lists do not all have the same number of elements, or if the
        BIDS keys found at a given position differ between the lists.
    """
    n_elements = [len(file_list) for file_list in input_files]
    if len(np.unique(n_elements)) != 1:
        # Display the offending input to ease debugging before failing.
        pprint(input_files)
        raise ValueError("Input list of files must have the same number of "
                         "elements.")

    template = "{participant_id}_{session}_{run}"
    for matched_paths in zip(*input_files):
        # Extract the BIDS entities of every file found at this position...
        bids_entities = list(map(get_bids_keys, matched_paths))
        # ... and reduce them to one identifier per file.
        identifiers = [template.format(**entities)
                       for entities in bids_entities]
        if len(np.unique(identifiers)) != 1:
            raise ValueError(
                "Input list of files are not ordered the same way.")
def plot_pca(X, df_description, outdir):
    """ Save the two first PCA components.

    Parameters
    ----------
    X: array (n_samples, ...)
        the input data. The data are flattened to (n_samples, n_features)
        and NaN values are replaced by 0 before the decomposition. As
        ``reshape`` returns a view whenever possible, this replacement may
        be applied in place on the caller's array.
    df_description: pandas DataFrame
        samples associated descriptions: must have 'n_samples' rows and a
        'participant_id' column.
    outdir: str
        the destination folder.

    Returns
    -------
    pca_path: str
        the path to the generated file.

    Raises
    ------
    ValueError
        if 'X' and 'df_description' do not have the same length, or if
        'df_description' has no 'participant_id' column.
    """
    if len(X) != len(df_description):
        raise ValueError("'X' and 'df_description' must have the same length.")
    if "participant_id" not in df_description.columns:
        raise ValueError("'df_description' must contains a 'participant_id' "
                         "column.")

    X = X.reshape(len(X), -1)
    X[np.isnan(X)] = 0
    pca = PCA(n_components=2)
    components = pca.fit_transform(X)

    fig, ax = plt.subplots(figsize=(20, 30))
    try:
        ax.scatter(components[:, 0], components[:, 1])
        # Label every point with the associated participant identifier.
        for idx, desc in enumerate(df_description["participant_id"]):
            ax.annotate(desc, xy=(components[idx, 0], components[idx, 1]),
                        xytext=(4, 4), textcoords="offset pixels")
        ax.set_xlabel("PC1 (var=%.2f)" % pca.explained_variance_ratio_[0])
        ax.set_ylabel("PC2 (var=%.2f)" % pca.explained_variance_ratio_[1])
        ax.axis("equal")
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        fig.tight_layout()
        pca_path = os.path.join(outdir, "pca.pdf")
        fig.savefig(pca_path)
    finally:
        # Release the figure: pyplot keeps every figure alive until it is
        # explicitly closed, which leaks memory when called repeatedly.
        plt.close(fig)
    return pca_path
def compute_mean_correlation(X, df_description, outdir):
    """ Compute mean correlation.

    The correlation matrix between samples is Fisher Z-transformed, and
    each sample is summarized by the mean of its Z-transformed correlations
    with all the other samples.

    Parameters
    ----------
    X: array (n_samples, ...)
        the input data. The data are flattened to (n_samples, n_features)
        and NaN/Inf values are replaced by 0. As ``reshape`` returns a view
        whenever possible, this replacement may be applied in place on the
        caller's array.
    df_description: pandas DataFrame
        samples associated descriptions: must have 'n_samples' rows and
        'participant_id', 'session', 'run' and 'ni_path' columns.
    outdir: str
        the destination folder.

    Returns
    -------
    df_corr: pandas DataFrame
        input data description sorted by ascending mean correlation (the
        samples that are the least correlated with the others come first):
        columns are 'participant_id', 'session', 'run', 'corr_mean'.
    heatmap_path: str
        path to the heatmap of the reordered correlation matrix.

    Raises
    ------
    ValueError
        if 'X' and 'df_description' do not have the same length, if a
        required column is missing, or if less than two samples are given.
    """
    # Checks
    if len(X) != len(df_description):
        raise ValueError("'X' and 'df_description' must have the same length.")
    for key in ("participant_id", "ni_path", "session", "run"):
        if key not in df_description.columns:
            raise ValueError(
                "'df_description' must contains a '{}' column.".format(key))
    if len(X) < 2:
        raise ValueError(
            "At least two samples are required to compute a correlation.")

    # Compute the correlation matrix
    X = X.reshape(len(X), -1)
    X[np.isnan(X)] = 0
    X[np.isinf(X)] = 0
    corr = np.corrcoef(X, dtype=np.single)

    # Compute the Z-transformation of the correlation
    den = 1. - corr
    den[den == 0] = 1e-8
    # Perfect anti-correlations and undefined correlations produce -Inf/NaN
    # that are reset to 0 just below: silence the associated warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        zcorr = 0.5 * np.log((1. + corr) / den)
    zcorr[np.isnan(zcorr)] = 0
    zcorr[np.isinf(zcorr)] = 0
    # Discard the self-correlation terms: after the Z-transformation the
    # diagonal is no longer equal to 1 (nor even constant across samples),
    # so it must be removed explicitly rather than by subtracting 1.
    np.fill_diagonal(zcorr, 0)
    zcorr_mean = zcorr.sum(axis=1) / (len(zcorr) - 1)

    # Get the index sorted by ascending Z-corrected mean correlation values
    sort_idx = np.argsort(zcorr_mean)
    corr_reorder = corr[np.ix_(sort_idx, sort_idx)]

    # Plot heatmap of mean correlation
    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        cmap = sns.color_palette("RdBu_r", 110)
        sns.heatmap(corr_reorder, mask=None, cmap=cmap, vmin=-1, vmax=1,
                    center=0, ax=ax)
        corr_path = os.path.join(outdir, "correlation.png")
        fig.savefig(corr_path)
    finally:
        # pyplot keeps figures alive until they are explicitly closed.
        plt.close(fig)

    # Generate data frame with results: 'sort_idx' holds positions, not
    # index labels, hence the positional selection.
    df_corr = df_description.iloc[sort_idx][
        ["participant_id", "session", "run"]].copy()
    df_corr["corr_mean"] = zcorr_mean[sort_idx]
    return df_corr, corr_path
def parse_fsreconall_stats(fs_dirs):
    """ Parse the FreeSurfer reconall generated quality control files for all
    subjects.

    For each directory, the 'scripts/recon-all.log' file is scanned for the
    line starting with 'orig.nofix lheno', from which the left and right
    hemisphere values are read and averaged.

    Parameters
    ----------
    fs_dirs: list of str
        list of FreeSurfer recon-all generated directories.

    Returns
    -------
    df_scores: pandas DataFrame
        the FreeSurfer recon-all scores organized by 'participant_id',
        'session', 'run', 'euler'. These columns are always present, even
        when 'fs_dirs' is empty.

    Raises
    ------
    ValueError
        if a log file does not contain exactly one 'orig.nofix lheno' line.
    """
    scores = {"participant_id": [], "session": [], "run": [], "euler": []}
    for path in fs_dirs:
        keys = get_bids_keys(path)
        participant_id = keys["participant_id"]
        session = keys["session"]
        run = keys["run"]
        logfile = os.path.join(path, "scripts", "recon-all.log")
        # Stream the log: only the matching lines are kept in memory.
        with open(logfile, "rt") as of:
            selection = [line for line in of
                         if line.startswith("orig.nofix lheno")]
        if len(selection) != 1:
            raise ValueError(
                "Expected exactly one 'orig.nofix lheno' line in {}, "
                "found: {}".format(logfile, selection))
        # The selected line is split on '=' into three parts: the second
        # holds the left value followed by a comma, the third the right one.
        _, left_euler, right_euler = selection[0].split("=")
        left_euler, _ = left_euler.split(",")
        left_euler = int(left_euler.strip())
        right_euler = int(right_euler.strip())
        euler = (left_euler + right_euler) * 0.5
        scores["participant_id"].append(participant_id)
        scores["session"].append(session)
        scores["run"].append(run)
        scores["euler"].append(euler)
    df_scores = pd.DataFrame.from_dict(scores)
    return df_scores
def parse_cat12vbm_roi(xml_filenames, output_file):
    """ Parse the cat12vbm xml generated rois files for all subjects.

    Parameters
    ----------
    xml_filenames: list of str
        paths to the CAT12 VBM catROI and cat xml files for all subjects,
        i.e. files matching `<PATH>/label/catROI_sub-*_ses-*_T1w.xml` and
        `<PATH>/report/cat_sub-*_ses-*_T1w.xml`. Files matching none of
        these two patterns are ignored.
    output_file: str
        the path to the generated tsv file.

    Returns
    -------
    output_file: str
        rois tsv path.
    """
    key_names = ["participant_id", "session", "run"]
    report_pattern = re.compile(r".*report/cat_.*\.xml")
    roi_pattern = re.compile(r".*label/catROI_.*\.xml")
    reference_names = None
    globvol_rows = []
    roivol_rows = []
    for xml_file in xml_filenames:
        xml_file_keys = get_bids_keys(xml_file)
        # Missing session/run entities default to '1'.
        sub_keys = {
            "participant_id": "sub-" + xml_file_keys['participant_id'],
            "session": xml_file_keys['session'] or '1',
            "run": xml_file_keys['run'] or '1'}

        if report_pattern.match(xml_file):
            cat = pd.read_xml(xml_file)
            try:
                # NOTE(review): the volumes are read from row 7 of the
                # flattened report - confirm against the CAT12 xml layout.
                tiv = cat['vol_TIV'][7]
                # The volumes are stored as a bracketed, whitespace
                # separated string: drop the brackets and split.
                vol_abs_cgw = cat['vol_abs_CGW'][7][1:-1].split()
                vol_abs_cgw = [float(volume) for volume in vol_abs_cgw]
            except Exception:
                print('Parsing error for %s:\n%s' %
                      (xml_file, traceback.format_exc()))
            else:
                row = dict(sub_keys)
                row['tiv'] = float(tiv)
                row['CSF_Vol'] = vol_abs_cgw[0]
                row['GM_Vol'] = vol_abs_cgw[1]
                row['WM_Vol'] = vol_abs_cgw[2]
                globvol_rows.append(row)

        elif roi_pattern.match(xml_file):
            tree = ET.parse(xml_file)
            try:
                iterparse = {"neuromorphometrics": ["ids", "Vgm", "Vcsf"]}
                catroi = pd.read_xml(xml_file, iterparse=iterparse)
                names_node = tree.find('neuromorphometrics').find('names')
                roi_names = [item.text for item in names_node.findall('item')]
                # The first parsed file defines the expected set of ROIs.
                if reference_names is None:
                    reference_names = roi_names
                if set(reference_names) != set(roi_names):
                    raise ValueError(
                        "Unexpected ROI names in {}.".format(xml_file))
                v_gm = catroi['Vgm'].str.replace(
                    r"\[|\]", "", regex=True).str.split(";")[0]
                v_gm = [float(volume) for volume in v_gm]
                v_csf = catroi['Vcsf'].str.replace(
                    r"\[|\]", "", regex=True).str.split(";")[0]
                v_csf = [float(volume) for volume in v_csf]
                if not len(roi_names) == len(v_gm) == len(v_csf):
                    raise ValueError(
                        "Mismatch between the number of ROI names and "
                        "volumes in {}.".format(xml_file))
            except Exception:
                print('Parsing error for %s: \n%s' %
                      (xml_file, traceback.format_exc()))
            else:
                row = dict(sub_keys)
                # Label the volumes with the names listed in this very
                # file: only the set of names is checked against the
                # reference, so the order may differ between files.
                for name, gm_vol, csf_vol in zip(roi_names, v_gm, v_csf):
                    row[name + '_GM_Vol'] = gm_vol
                    row[name + '_CSF_Vol'] = csf_vol
                roivol_rows.append(row)

    # Build each table once (instead of growing it at every iteration) and
    # keep the merge keys available even when a table has no row.
    if globvol_rows:
        cohort_globvol = pd.DataFrame(globvol_rows)
    else:
        cohort_globvol = pd.DataFrame(columns=key_names)
    if roivol_rows:
        cohort_roivol = pd.DataFrame(roivol_rows)
    else:
        cohort_roivol = pd.DataFrame(columns=key_names)
    cohort_volumes = cohort_globvol.merge(cohort_roivol, how='outer',
                                          on=key_names)
    cohort_volumes.to_csv(output_file, sep="\t", float_format=str,
                          index=False)
    return output_file
def parse_cat12vbm_qc(qc_files):
    """ Parse the CAT12 VBM generated quality control files for all
    subjects.

    Parameters
    ----------
    qc_files: list of str
        list of CAT12 VBM generated quality control xml files. Only the
        files whose path matches `.*report/cat_.*.xml` are parsed, the
        other ones are ignored.

    Returns
    -------
    df_scores: pandas DataFrame
        the CAT12 VBM scores organized by 'participant_id', 'session',
        'run', 'NCR', 'ICR', 'IQR'. These columns are always present, even
        when no file is parsed. When the quality ratings cannot be read
        from a file, the error is printed and the three scores are set to
        NaN.
    """
    columns = ("participant_id", "session", "run", "NCR", "ICR", "IQR")
    scores = {name: [] for name in columns}
    report_pattern = re.compile(r".*report/cat_.*\.xml")
    for xml_file in qc_files:
        keys = get_bids_keys(xml_file)
        participant_id = keys["participant_id"]
        session = keys["session"]
        run = keys["run"]
        # Skip unrelated files explicitly so that no score from a previous
        # iteration can ever be attributed to them.
        if not report_pattern.match(xml_file):
            continue
        tree = ET.parse(xml_file)
        try:
            ncr = float(tree.find("qualityratings").find("NCR").text)
            icr = float(tree.find("qualityratings").find("ICR").text)
            iqr = float(tree.find("qualityratings").find("IQR").text)
        except Exception as e:
            print(e)
            trace = traceback.format_exc()
            print("Parsing error for {}:\n{}".format(xml_file, trace))
            ncr, icr, iqr = (np.nan, np.nan, np.nan)
        scores["participant_id"].append(participant_id)
        scores["session"].append(session)
        scores["run"].append(run)
        scores["NCR"].append(ncr)
        scores["ICR"].append(icr)
        scores["IQR"].append(iqr)
    df_scores = pd.DataFrame.from_dict(scores)
    return df_scores
def parse_cat12vbm_report(img_files, cat12vbm_root):
    """ Parse the CAT12 VBM report files for all subjects.

    Parameters
    ----------
    img_files: list of str
        path to images: '.nii' or '.nii.gz' files.
    cat12vbm_root: str
        the root path of the CAT12VBM preprocessing folder.

    Returns
    -------
    reports: list of str
        the associated CAT12 VBM reports, one per input image: the path to
        the first existing candidate report, or an empty string when no
        report is found.

    Raises
    ------
    ValueError
        if an image has an unexpected file extension.
    """
    reports = []
    for path in img_files:
        keys = get_bids_keys(path)
        participant_id = keys["participant_id"]
        session = keys["session"]
        # Drop the first four characters of the file name.
        # NOTE(review): presumably a 4-character CAT12 prefix - confirm.
        name = os.path.basename(path)[4:]
        # Swap the trailing extension only: an occurrence of '.nii'
        # elsewhere in the name must be left untouched.
        for extension in (".nii.gz", ".nii"):
            if name.endswith(extension):
                name = name[:-len(extension)] + ".pdf"
                break
        else:
            raise ValueError("Unexpected file extension: {}.".format(path))

        sub_dir = "sub-{}".format(participant_id)
        ses_dir = "ses-{}".format(session)
        # Candidate locations, by decreasing priority.
        candidates = [
            os.path.join(sub_dir, ses_dir, "anat", "report",
                         "catreport_{}".format(name)),
            os.path.join(sub_dir, "anat", "report",
                         "catreport_{}".format(name)),
            os.path.join(sub_dir, ses_dir, "anat", "report",
                         "catreport_r{}".format(name))]
        for candidate in candidates:
            _path = os.path.join(cat12vbm_root, candidate)
            if os.path.isfile(_path):
                reports.append(_path)
                break
        else:
            # No candidate exists: keep the output aligned with the input.
            reports.append("")
    return reports

Follow us

© 2023, brainprep developers