Package that provides tools for brain MRI Deep Learning pre-processing.
Source code for brainprep.qc
# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2021 - 2022
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################
"""
Usefull automatic quality control (QC) functions.
"""
# Imports
import os
import re
import traceback
import numpy as np
import pandas as pd
from pprint import pprint
import xml.etree.ElementTree as ET
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from .utils import get_bids_keys
[docs]def check_files(input_files):
""" Check if all data are ordered the same way and follows the BIDS
nomenclature.
Parameters
----------
input_files: list of list
"""
sizes = [len(item) for item in input_files]
if len(np.unique(sizes)) != 1:
pprint(input_files)
raise ValueError("Input list of files must have the same number of "
"elements.")
for item in zip(*input_files):
keys = [get_bids_keys(path) for path in item]
keys = ["{participant_id}_{session}_{run}".format(**item)
for item in keys]
if len(np.unique(keys)) != 1:
raise ValueError(
"Input list of files are not ordered the same way.")
[docs]def plot_pca(X, df_description, outdir):
""" Save the two first PCA components.
Parameters
----------
X: array (n_samples, ...)
the input data.
df_description: pandas DataFrame
samples associated descriptons: must have 'n_samples' rows and a
'participant_id' column.
outdir: str
the destination folder.
Returns
-------
pca_path: str
the path to the generated file.
"""
if len(X) != len(df_description):
raise ValueError("'X' and 'df_description' must have the same length.")
if "participant_id" not in df_description.columns:
raise ValueError("'df_description' must contains a 'participant_id' "
"column.")
X = X.reshape(len(X), -1)
X[np.isnan(X)] = 0
pca = PCA(n_components=2)
components = pca.fit_transform(X)
fig, ax = plt.subplots(figsize=(20, 30))
ax.scatter(components[:, 0], components[:, 1])
for idx, desc in enumerate(df_description["participant_id"]):
ax.annotate(desc, xy=(components[idx, 0], components[idx, 1]),
xytext=(4, 4), textcoords="offset pixels")
plt.xlabel("PC1 (var=%.2f)" % pca.explained_variance_ratio_[0])
plt.ylabel("PC2 (var=%.2f)" % pca.explained_variance_ratio_[1])
plt.axis("equal")
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.tight_layout()
pca_path = os.path.join(outdir, "pca.pdf")
plt.savefig(pca_path)
return pca_path
[docs]def compute_mean_correlation(X, df_description, outdir):
""" Compute mean correlation.
Parameters
----------
X: array (n_samples, ...)
the input data.
df_description: pandas DataFrame
samples associated descriptons: must have 'n_samples' rows and
'participant_id', 'session', 'run' and 'ni_path' columns.
outdir: str
the destination folder.
Returns
-------
df_corr: pandas DataFrame
sorted input data description based on mean correlation: columns are
'participant_id', 'session', 'run', 'corr_mean'.
heatmap_path: str
path to the heatmap of mean correlation.
"""
# Checks
if len(X) != len(df_description):
raise ValueError("'X' and 'df_description' must have the same length.")
for key in ("participant_id", "ni_path", "session", "run"):
if key not in df_description.columns:
raise ValueError(
"'df_description' must contains a '{}' column.".format(key))
# Compute the correlation matrix
X = X.reshape(len(X), -1)
X[np.isnan(X)] = 0
X[np.isinf(X)] = 0
corr = np.corrcoef(X, dtype=np.single)
# Compute the Z-transformation of the correlation
den = 1. - corr
den[den == 0] = 1e-8
zcorr = 0.5 * np.log((1. + corr) / den)
zcorr[np.isnan(zcorr)] = 0
zcorr[np.isinf(zcorr)] = 0
zcorr_mean = (zcorr.sum(axis=1) - 1) / (len(zcorr) - 1)
# Get the index sorted by descending Z-corrected mean correlation values
sort_idx = np.argsort(zcorr_mean)
participant_ids = df_description["participant_id"][sort_idx]
sessions_ids = df_description["session"][sort_idx]
run_ids = df_description["run"][sort_idx]
corr_reorder = corr[np.ix_(sort_idx, sort_idx)]
# Plot heatmap of mean correlation
plt.subplots(figsize=(10, 10))
cmap = sns.color_palette("RdBu_r", 110)
sns.heatmap(corr_reorder, mask=None, cmap=cmap, vmin=-1, vmax=1, center=0)
corr_path = os.path.join(outdir, "correlation.png")
plt.savefig(corr_path)
# Generate data frame with results
df_corr = pd.DataFrame(dict(participant_id=participant_ids,
session=sessions_ids,
run=run_ids,
corr_mean=zcorr_mean[sort_idx]))
df_corr = df_corr.reindex(
["participant_id", "session", "run", "corr_mean"], axis="columns")
return df_corr, corr_path
[docs]def parse_fsreconall_stats(fs_dirs):
""" Parse the FreeSurfer reconall generated quality control files for all
subjects.
Parameters
----------
fs_dirs: list of str
list of FreeSurfer recon-all generated directories.
Returns
-------
df_scores: pandas DataFrame
the FreeSurfer recon-all scores organized by 'participant_id',
'session', 'run', 'euler'.
"""
scores = {}
for path in fs_dirs:
keys = get_bids_keys(path)
participant_id = keys["participant_id"]
session = keys["session"]
run = keys["run"]
logfile = os.path.join(path, "scripts", "recon-all.log")
with open(logfile, "rt") as of:
lines = of.readlines()
selection = [item for item in lines
if item.startswith("orig.nofix lheno")]
assert len(selection) == 1, selection
_, left_euler, right_euler = selection[0].split("=")
left_euler, _ = left_euler.split(",")
left_euler = int(left_euler.strip())
right_euler = int(right_euler.strip())
euler = (left_euler + right_euler) * 0.5
scores.setdefault("participant_id", []).append(participant_id)
scores.setdefault("session", []).append(session)
scores.setdefault("run", []).append(run)
scores.setdefault("euler", []).append(euler)
df_scores = pd.DataFrame.from_dict(scores)
return df_scores
[docs]def parse_cat12vbm_roi(xml_filenames, output_file):
""" Parse the cat12vbm xml generated rois files for all
subjects.
Parameters
----------
xml_filenames: list or str(regex,regex)
regex to the CAT12 VBM catROI and cat xml files for all subjects:
`<PATH>/label/catROI_sub-*_ses-*_T1w.xml`,
`<PATH>/report/cat_sub-*_ses-*_T1w.xml`.
output: str
the destination folder.
Returns
-------
output_file: str
rois tsv path.
"""
roi_names = None
cohort_globvol = pd.DataFrame()
cohort_roivol = pd.DataFrame()
for xml_file in xml_filenames:
df_sub_key = pd.DataFrame()
xml_file_keys = get_bids_keys(xml_file)
participant_id = "sub-"+xml_file_keys['participant_id']
session = xml_file_keys['session'] or '1'
run = xml_file_keys['run'] or '1'
df_sub_key["participant_id"] = [participant_id]
df_sub_key["session"] = [session]
df_sub_key["run"] = [run]
if re.match('.*report/cat_.*\.xml', xml_file):
cat = pd.read_xml(xml_file)
try:
tiv = cat['vol_TIV'][7]
vol_abs_cgw = cat['vol_abs_CGW'][7][1:-1].split()
vol_abs_cgw = [float(volume) for volume in vol_abs_cgw]
except Exception as e:
print('Parsing error for %s:\n%s' %
(xml_file, traceback.format_exc()))
else:
globvolume_dico_sub = {}
globvolume_dico_sub['tiv'] = float(tiv)
globvolume_dico_sub['CSF_Vol'] = vol_abs_cgw[0]
globvolume_dico_sub['GM_Vol'] = vol_abs_cgw[1]
globvolume_dico_sub['WM_Vol'] = vol_abs_cgw[2]
df_global_sub = pd.DataFrame(globvolume_dico_sub, index=[0])
concat_globvol = [df_sub_key, df_global_sub]
sub_globvol = pd.concat(concat_globvol, axis=1)
cohort_globvol = pd.concat([cohort_globvol, sub_globvol], axis=0)
elif re.match('.*label/catROI_.*\.xml', xml_file):
tree = ET.parse(xml_file)
try:
iterparse = {"neuromorphometrics": ["ids", "Vgm", "Vcsf"]}
catroi = pd.read_xml(xml_file, iterparse=iterparse)
_roi_names = [item.text for item in
tree.find('neuromorphometrics')
.find('names').findall('item')]
if roi_names is None:
roi_names = _roi_names
assert set(roi_names) == set(_roi_names), xml_file
v_gm = catroi['Vgm'].str.replace("\[|\]", "", regex=True)\
.str.split(";")[0]
v_gm = [float(volume) for volume in v_gm]
v_csf = catroi['Vcsf'].str.replace("\[|\]", "", regex=True)\
.str.split(";")[0]
v_csf = [float(volume) for volume in v_csf]
assert len(roi_names) == len(v_gm) == len(v_csf)
except Exception as e:
print('Parsing error for %s: \n%s' %
(xml_file, traceback.format_exc()))
else:
rois_sub = {}
gm_rois_names = [rois_name+'_GM_Vol' for rois_name
in roi_names]
csf_rois_names = [rois_name+'_CSF_Vol' for rois_name
in roi_names]
for idx, gmroiname in enumerate(gm_rois_names):
rois_sub[gmroiname] = v_gm[idx]
rois_sub[csf_rois_names[idx]] = v_csf[idx]
df_rois_sub = pd.DataFrame(rois_sub, index=[0])
concat_roivol = [df_sub_key, df_rois_sub]
sub_roivol = pd.concat(concat_roivol, axis=1)
cohort_roivol = pd.concat([cohort_roivol, sub_roivol], axis=0)
roi_names = roi_names or []
cohort_volumes = cohort_globvol.merge(cohort_roivol, how='outer',
on=['participant_id', 'session',
'run'])
cohort_volumes.to_csv(output_file, sep="\t", float_format=str, index=False)
return output_file
[docs]def parse_cat12vbm_qc(qc_files):
""" Parse the CAT12 VBM generated quality control files for all
subjects.
Parameters
----------
qc_files: list of str
list of CAT12 VBM generated quality control xml files.
Returns
-------
df_scores: pandas DataFrame
the CAT12 VBM scores organized by 'participant_id', 'session', 'run',
'NCR', 'ICR', 'IQR'.
"""
scores = {}
for xml_file in qc_files:
keys = get_bids_keys(xml_file)
participant_id = keys["participant_id"]
session = keys["session"]
run = keys["run"]
if re.match(".*report/cat_.*\.xml", xml_file):
tree = ET.parse(xml_file)
try:
ncr = float(tree.find("qualityratings").find("NCR").text)
icr = float(tree.find("qualityratings").find("ICR").text)
iqr = float(tree.find("qualityratings").find("IQR").text)
except Exception as e:
print(e)
trace = traceback.format_exc()
print("Parsing error for {}:\n{}".format(xml_file, trace))
ncr, icr, iqr = (np.nan, np.nan, np.nan)
scores.setdefault("participant_id", []).append(participant_id)
scores.setdefault("session", []).append(session)
scores.setdefault("run", []).append(run)
scores.setdefault("NCR", []).append(ncr)
scores.setdefault("ICR", []).append(icr)
scores.setdefault("IQR", []).append(iqr)
df_scores = pd.DataFrame.from_dict(scores)
return df_scores
[docs]def parse_cat12vbm_report(img_files, cat12vbm_root):
""" Parse the CAT12 VBM report files for all subjects.
Parameters
----------
img_files: list of str
path to images.
cat12vbm_root: str
the root path of the CAT12VBM preprocessing folder.
Returns
-------
reports: list of str
the associated CAT12 VBM reports.
"""
reports = []
for path in img_files:
keys = get_bids_keys(path)
participant_id = keys["participant_id"]
session = keys["session"]
name = os.path.basename(path)[4:]
if name.endswith(".nii.gz"):
name = name.replace(".nii.gz", ".pdf")
elif name.endswith(".nii"):
name = name.replace(".nii", ".pdf")
else:
raise ValueError("Unexpected file extension: {}.".format(path))
rpath = [
os.path.join(
"sub-{}".format(participant_id), "ses-{}".format(session),
"anat", "report", "catreport_{}".format(name)),
os.path.join(
"sub-{}".format(participant_id), "anat", "report",
"catreport_{}".format(name)),
os.path.join(
"sub-{}".format(participant_id), "ses-{}".format(session),
"anat", "report", "catreport_r{}".format(name)),
None
]
for _rpath in rpath:
if _rpath is None:
reports.append("")
break
_path = os.path.join(cat12vbm_root, _rpath)
if os.path.isfile(_path):
reports.append(_path)
break
return reports
Follow us