Source code for ramutils.reports.summary

from __future__ import division
from __future__ import unicode_literals

from datetime import datetime

import json
import numpy as np
import pandas as pd
import pytz
import pickle
import base64
from collections import OrderedDict
import io

from ramutils.utils import safe_divide
from ramutils.events import extract_subject, extract_experiment_from_events, \
    extract_sessions
from ramutils.bayesian_optimization import choose_location
from ramutils.exc import TooManySessionsError
from ramutils.parameters import ExperimentParameters
from ramutils.powers import save_power_plot, save_eeg_by_channel_plot
from ramutils.utils import encode_file
from ramutils.montage import (generate_pairs_for_classifier, get_distances,
                              get_used_pair_mask)
from ramutils.thetamod import tmi
from traitschema import Schema
from traits.api import Array, ArrayOrNone, Float, Unicode, Bool, Bytes, CArray


from sklearn.metrics import roc_auc_score, roc_curve
from statsmodels.stats.proportion import proportions_chisquare


__all__ = [
    'Summary',
    'ClassifierSummary',
    'SessionSummary',
    'StimSessionSummary',
    'FRSessionSummary',
    'CatFRSessionSummary',
    'FRStimSessionSummary',
    'FR5SessionSummary',
    'TICLFRSessionSummary',
    'PSSessionSummary',
    'LocationSearchSessionSummary',
    'MathSummary'
]


class ClassifierSummary(Schema):
    """ Classifier Summary Object """
    _predicted_probabilities = ArrayOrNone(
        desc='predicted recall probabilities')
    _true_outcomes = ArrayOrNone(
        desc='actual results for recall vs. non-recall')
    _permuted_auc_values = ArrayOrNone(desc='permuted AUCs')
    _frequencies = ArrayOrNone(
        desc='frequencies the classifier was trained on')
    _pairs = ArrayOrNone(desc='bipolar pairs used to train the classifier')
    _features = ArrayOrNone(desc='feature matrix used to train the classifier')
    _coef = ArrayOrNone(desc='classifier coefficients')

    subject = Unicode(desc='subject')
    experiment = Unicode(desc='experiment')
    sessions = Array(desc='sessions summarized by the object')
    recall_rate = Float(desc='overall recall rate')
    tag = Unicode(desc='name of the classifier')
    reloaded = Bool(desc='classifier was reloaded from hard disk')
    low_terc_recall_rate = Float(
        desc='recall rate when predicted probability of recall was in the '
             'lowest tercile')
    mid_terc_recall_rate = Float(
        desc='recall rate when predicted probability of recall was in the '
             'middle tercile')
    high_terc_recall_rate = Float(
        desc='recall rate when predicted probability of recall was in the '
             'highest tercile')

    @property
    def id(self):
        session_str = ".".join([str(sess) for sess in np.unique(self.sessions)])
        return ":".join([self.subject, self.experiment, session_str])

    @property
    def predicted_probabilities(self):
        """ Classifier output for each word encoding event """
        return self._predicted_probabilities

    @predicted_probabilities.setter
    def predicted_probabilities(self, new_predicted_probabilities):
        if self._predicted_probabilities is None:
            self._predicted_probabilities = new_predicted_probabilities

    @property
    def true_outcomes(self):
        """ Behavioral response (recalled/not recalled) to each word
        encoding event """
        return self._true_outcomes

    @true_outcomes.setter
    def true_outcomes(self, new_true_outcomes):
        if self._true_outcomes is None:
            self._true_outcomes = new_true_outcomes

    @property
    def permuted_auc_values(self):
        """ Array of AUC values from performing a permutation test """
        return self._permuted_auc_values

    @permuted_auc_values.setter
    def permuted_auc_values(self, new_permuted_auc_values):
        if self._permuted_auc_values is None:
            self._permuted_auc_values = new_permuted_auc_values

    @property
    def auc(self):
        """ Classifier AUC """
        return roc_auc_score(self.true_outcomes, self.predicted_probabilities)

    @property
    def pvalue(self):
        """ p-value of the classifier AUC based on the permuted AUCs """
        pvalue = np.count_nonzero(
            self.permuted_auc_values >= self.auc) / float(
            len(self.permuted_auc_values))
        return pvalue

    @property
    def false_positive_rate(self):
        """ False positive rate used for the ROC curve """
        fpr, _, _ = roc_curve(self.true_outcomes, self.predicted_probabilities)
        return fpr.tolist()

    @property
    def true_positive_rate(self):
        """ True positive rate used for the ROC curve """
        _, tpr, _ = roc_curve(self.true_outcomes, self.predicted_probabilities)
        return tpr.tolist()

    @property
    def thresholds(self):
        """ Thresholds used for the ROC curve """
        _, _, thresholds = roc_curve(
            self.true_outcomes, self.predicted_probabilities)
        return thresholds.tolist()

    @property
    def median_classifier_output(self):
        """ Median of the classifier outputs """
        return np.median(self.predicted_probabilities)

    @property
    def confidence_interval_median_classifier_output(self):
        """ 95% confidence interval for the median of the classifier output.
        Used as a sniff test for whether something is amiss; it should be
        centered around 0.5 """
        sorted_probs = sorted(self.predicted_probabilities)
        n = len(self.predicted_probabilities)
        # Rank-based CI for the median: ranks n/2 +/- 1.96 * sqrt(n) / 2
        low_idx = int(round((n / 2.0) - ((1.96 * n ** .5) / 2.0)))
        high_idx = int(round(1 + (n / 2.0) + ((1.96 * n ** .5) / 2.0)))
        low_val = sorted_probs[low_idx]
        high_val = sorted_probs[high_idx]
        return low_val, high_val

    @property
    def low_tercile_diff_from_mean(self):
        """ % change in recall rate from overall recall when classifier
        output was in the lowest tercile """
        return 100.0 * (self.low_terc_recall_rate -
                        self.recall_rate) / self.recall_rate

    @property
    def mid_tercile_diff_from_mean(self):
        """ % change in recall rate from overall recall when classifier
        output was in the middle tercile """
        return 100.0 * (self.mid_terc_recall_rate -
                        self.recall_rate) / self.recall_rate

    @property
    def high_tercile_diff_from_mean(self):
        """ % change in recall rate from overall recall when classifier
        output was in the highest tercile """
        return 100.0 * (self.high_terc_recall_rate -
                        self.recall_rate) / self.recall_rate

    @property
    def features(self):
        return self._features if self._features is not None else np.array([])

    @property
    def pairs(self):
        return self._pairs if self._pairs is not None else np.array([])

    @property
    def frequencies(self):
        return self._frequencies if self._frequencies is not None else np.array([])

    @property
    def classifier_activation(self):
        """ Forward model of classifier activation from Haufe et al., 2014 """
        if self._features is None:
            return np.array([])
        return np.dot(np.cov(self._features, rowvar=False),
                      self._coef.squeeze())

    @property
    def classifier_activation_2d(self):
        return self.classifier_activation.reshape(
            len(self.pairs), len(self.frequencies))

    @property
    def classifier_activation_by_region(self):
        if len(self.classifier_activation):
            activation_df = pd.DataFrame(data=self.classifier_activation_2d,
                                         index=self.pairs['region'])
            mean_activation = activation_df.groupby(activation_df.index).mean()
            return mean_activation.values.T
        else:
            return np.array([])

    @property
    def regions(self):
        """ List of unique electrode regions """
        if len(self.pairs):
            return [str(x) for x in np.unique(self.pairs['region'])]
        else:
            return []
    def populate(self, subject, experiment, session, true_outcomes,
                 predicted_probabilities, permuted_auc_values, frequencies,
                 pairs, features, coefficients, tag='', reloaded=False):
        """ Populate classifier performance metrics

        Parameters
        ----------
        subject: string
            Subject identifier
        experiment: string
            Name of the experiment
        session: string
            Session number
        true_outcomes: array_like
            Boolean array indicating whether each word was recalled
        predicted_probabilities: array_like
            Outputs from the trained classifier for each word event
        permuted_auc_values: array_like
            AUC values from performing a permutation test on the classifier
        frequencies: array_like
            Frequencies used to train the classifier
        pairs: pd.DataFrame
            Metadata for each bipolar pair recorded from
        features: np.ndarray
            Feature matrix used to train the classifier, of shape
            [len(predicted_probabilities), len(pairs) * len(frequencies)]
        coefficients: np.ndarray
            Array of classifier weights
        tag: str
            Name given to the classifier, used to differentiate between
            multiple classifiers
        reloaded: bool
            Indicates whether the classifier was reloaded from hard disk,
            i.e. is the classifier actually used. If False, the classifier
            was created from scratch.
        """
        self.subject = subject
        self.experiment = experiment
        self.sessions = session
        self.true_outcomes = true_outcomes
        self.predicted_probabilities = predicted_probabilities
        self.permuted_auc_values = permuted_auc_values
        self.tag = tag
        self.reloaded = reloaded
        self._frequencies = frequencies
        self._pairs = pairs
        self._features = features
        self._coef = coefficients

        thresh_low = np.percentile(predicted_probabilities, 100.0 / 3.0)
        thresh_high = np.percentile(predicted_probabilities, 2.0 * 100.0 / 3.0)

        low_tercile_mask = (predicted_probabilities <= thresh_low)
        high_tercile_mask = (predicted_probabilities >= thresh_high)
        mid_tercile_mask = ~(low_tercile_mask | high_tercile_mask)

        self.low_terc_recall_rate = np.sum(
            true_outcomes[low_tercile_mask]) / float(np.sum(low_tercile_mask))
        self.mid_terc_recall_rate = np.sum(
            true_outcomes[mid_tercile_mask]) / float(np.sum(mid_tercile_mask))
        self.high_terc_recall_rate = np.sum(
            true_outcomes[high_tercile_mask]) / float(np.sum(high_tercile_mask))

        self.recall_rate = np.sum(true_outcomes) / float(true_outcomes.size)
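
# Usage sketch (editor's illustration, not part of the original module):
# populating a ClassifierSummary from synthetic data and reading off the
# headline metrics. All inputs below are invented; real values come from the
# classifier training pipeline. Kept commented out so importing the module
# stays side-effect free.
#
#   rng = np.random.RandomState(0)
#   outcomes = rng.randint(0, 2, size=300)             # recalled or not
#   probs = np.clip(0.2 * outcomes + 0.6 * rng.rand(300), 0.0, 1.0)
#   permuted_aucs = rng.uniform(0.45, 0.55, size=200)  # null distribution
#   features = rng.randn(300, 10 * 8)                  # 10 pairs x 8 freqs
#   coefs = rng.randn(10 * 8)
#
#   summary = ClassifierSummary()
#   summary.populate('R0000X', 'FR1', [0], outcomes, probs, permuted_aucs,
#                    np.logspace(0, 2, 8), None, features, coefs)
#   print(summary.auc, summary.pvalue, summary.low_tercile_diff_from_mean)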
class MathSummary(Schema):
    """Summarizes data from math distractor periods. Input events must
    either be all events (which include math events) or just math events.
    """
    _events = ArrayOrNone(desc='math distractor task events')
    def populate(self, events):
        """ Populate the summary object with the given events """
        self.events = events
    def to_dataframe(self, recreate=False):
        """Convert the summary to a :class:`pd.DataFrame` for easier
        manipulation. This amounts to converting the events to a dataframe.

        Keyword arguments
        -----------------
        recreate : bool
            Force re-creating the dataframe. Otherwise, it will only be
            created the first time this method is called and stored as an
            instance attribute.

        Returns
        -------
        pd.DataFrame

        """
        if not hasattr(self, '_df') or recreate:
            self._df = pd.DataFrame.from_records(self.events)
        return self._df
    @property
    def events(self):
        """ For math events, returns the original events after excluding
        practice lists """
        events = np.rec.array(self._events)
        return events[events.list > -1]

    @events.setter
    def events(self, new_events):
        if self._events is None:
            self._events = np.rec.array(new_events)

    @property
    def session_number(self):
        """ Session number """
        return np.unique(self.events.session)[0]

    @property
    def num_problems(self):
        """Returns the total number of problems solved by the subject."""
        return len(self.events[(self.events.type == 'PROB') |
                               (self.events.type == b'PROB')])

    @property
    def num_lists(self):
        """ Number of lists at least partially completed in the session """
        return len(np.unique(self.events.list))

    @property
    def num_correct(self):
        """Returns the number of problems solved correctly."""
        return len(self.events[self.events.iscorrect == 1])

    @property
    def percent_correct(self):
        """Returns the percentage of problems solved correctly."""
        return 100 * self.num_correct / self.num_problems

    @property
    def problems_per_list(self):
        """Returns the mean number of problems per list."""
        return self.num_problems / self.num_lists
    @staticmethod
    def total_num_problems(summaries):
        """Get the total number of problems for multiple sessions.

        Parameters
        ----------
        summaries : List[MathSummary]

        Returns
        -------
        int

        """
        return sum([summary.num_problems for summary in summaries])
    @staticmethod
    def total_num_correct(summaries):
        """Get the total number of correctly answered problems for multiple
        sessions.

        Parameters
        ----------
        summaries : List[MathSummary]

        Returns
        -------
        int

        """
        return sum([summary.num_correct for summary in summaries])
    @staticmethod
    @safe_divide
    def total_percent_correct(summaries):
        """Get the percentage of correctly answered problems for multiple
        sessions.

        Parameters
        ----------
        summaries : List[MathSummary]

        Returns
        -------
        float

        """
        probs = MathSummary.total_num_problems(summaries)
        correct = MathSummary.total_num_correct(summaries)
        return 100 * correct / probs
    @staticmethod
    def total_problems_per_list(summaries):
        """Get the mean number of problems per list for multiple sessions.

        Parameters
        ----------
        summaries : List[MathSummary]

        Returns
        -------
        float

        """
        n_lists = sum([summary.num_lists for summary in summaries])
        return MathSummary.total_num_problems(summaries) / n_lists
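
# Usage sketch (editor's illustration): aggregating math-distractor
# performance across several sessions. `math_summaries` is assumed to be a
# list of populated MathSummary objects.
#
#   pct_correct = MathSummary.total_percent_correct(math_summaries)
#   per_list = MathSummary.total_problems_per_list(math_summaries)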
class Summary(Schema):
    """Base class for all session summary objects."""
    _events = ArrayOrNone(
        desc='task-related events excluding math distractor events')
    _raw_events = ArrayOrNone(
        desc='all event types including math distractor events')
    _bipolar_pairs = Unicode(desc='bipolar pairs in montage')
    _excluded_pairs = Unicode(desc='bipolar pairs not used for classification '
                                   'due to artifact or stimulation')
    _normalized_powers = ArrayOrNone(desc='normalized powers for all events '
                                          'and recorded pairs')

    @property
    def events(self):
        """ Numpy recarray of task events, i.e. the events used to train a
        classifier """
        return np.rec.array(self._events)

    @events.setter
    def events(self, new_events):
        if self._events is None:
            self._events = np.rec.array(new_events)

    @property
    def raw_events(self):
        """ :class:`np.rec.array` of all events (math and task) from the
        session """
        if self._raw_events is None:
            return None
        return np.rec.array(self._raw_events)

    @raw_events.setter
    def raw_events(self, new_events):
        if self._raw_events is None and new_events is not None:
            self._raw_events = np.rec.array(new_events)
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, raw_events=None):
        """ Abstract method to be overridden by child classes """
        raise NotImplementedError
    @classmethod
    def create(cls, events, bipolar_pairs, excluded_pairs, normalized_powers,
               raw_events=None):
        """Create a new summary object from events.

        Parameters
        ----------
        events : :class:`np.recarray`
        raw_events : :class:`np.recarray`
        bipolar_pairs : dict
            Dictionary containing data on the bipolar pairs in the montage
        excluded_pairs : dict
            Dictionary containing data on pairs excluded from analysis
        normalized_powers : :class:`np.ndarray`
            2D array of normalized powers of shape
            n_events x (n_frequencies * n_bipolar_pairs)

        """
        instance = cls()
        instance.populate(events, bipolar_pairs, excluded_pairs,
                          normalized_powers, raw_events=raw_events)
        return instance
class SessionSummary(Summary):
    """Base class for single-session summary objects."""

    @property
    def subject(self):
        """ Subject ID associated with the session """
        return extract_subject(self.events, add_localization=True)

    @property
    def experiment(self):
        """ Experiment name """
        experiments = extract_experiment_from_events(self.events)
        return experiments[0]

    @property
    def session_number(self):
        """ Session number """
        sessions = extract_sessions(self.events)
        if len(sessions) != 1:
            raise TooManySessionsError("Single session expected for session "
                                       "summary")
        return str(sessions[0])

    @property
    def id(self):
        return ":".join([self.subject, self.experiment, self.session_number])

    @property
    def events(self):
        """ :class:`np.recarray` of events """
        return np.rec.array(self._events)

    @events.setter
    def events(self, new_events):
        """Only allow setting of events which contain a single session."""
        assert len(np.unique(new_events['session'])) == 1, \
            "events should only be from a single session"
        if self._events is None:
            self._events = np.rec.array(new_events)

    @property
    def bipolar_pairs(self):
        """ Returns a dictionary of bipolar pairs """
        return json.loads(self._bipolar_pairs)

    @bipolar_pairs.setter
    def bipolar_pairs(self, new_bipolar_pairs):
        self._bipolar_pairs = json.dumps(new_bipolar_pairs)

    @property
    def excluded_pairs(self):
        """ Returns a dictionary of bipolar pairs excluded from classifier
        training """
        return json.loads(self._excluded_pairs)

    @excluded_pairs.setter
    def excluded_pairs(self, new_excluded_pairs):
        self._excluded_pairs = json.dumps(new_excluded_pairs)

    @property
    def n_pairs(self):
        """ Returns the number of bipolar pairs in the recording """
        return len(self.bipolar_pairs[self.subject]['pairs'])

    @property
    def normalized_powers(self):
        """ Powers normalized to zero mean and unit variance """
        return self._normalized_powers

    @normalized_powers.setter
    def normalized_powers(self, new_normalized_powers):
        self._normalized_powers = new_normalized_powers

    @property
    def normalized_powers_covariance(self):
        return np.cov(self._normalized_powers.T)

    @property
    def normalized_powers_plot(self):
        """ Plots the matrix of normalized powers for the session to an
        in-memory buffer and returns the plot as a base64-encoded string """
        plot_buffer = io.BytesIO()
        save_power_plot(self.normalized_powers, self.session_number,
                        plot_buffer)
        return encode_file(plot_buffer)

    @property
    def session_length(self):
        """Computes the total amount of time the session lasted in
        seconds."""
        start = self.events.mstime.min()
        end = self.events.mstime.max()
        return (end - start) / 1000.

    @property
    def session_datetime(self):
        """Returns a timezone-aware datetime object of the end time of the
        session in UTC.
        """
        timestamp = self.events.mstime.max() / 1000.
        return datetime.fromtimestamp(timestamp, pytz.utc)

    @property
    def num_lists(self):
        """ Number of lists completed in the session """
        return len(np.unique(self.events.list))
    def to_dataframe(self, recreate=False):
        """Convert the summary to a :class:`pd.DataFrame` for easier
        manipulation. This amounts to converting the events to a dataframe.

        Keyword arguments
        -----------------
        recreate : bool
            Force re-creating the dataframe. Otherwise, it will only be
            created the first time this method is called and stored as an
            instance attribute.

        Returns
        -------
        pd.DataFrame

        """
        if not hasattr(self, '_df') or recreate:
            self._df = pd.DataFrame.from_records(self.events)
        return self._df
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, raw_events=None):
        """Populate attributes and store events."""
        self.events = events
        self.raw_events = raw_events
        self.bipolar_pairs = bipolar_pairs
        self.excluded_pairs = excluded_pairs
        self.normalized_powers = normalized_powers
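
# Usage sketch (editor's illustration): summaries are normally built via the
# `create` classmethod inherited from Summary, and the cached dataframe can
# be refreshed with `recreate=True`. Inputs are assumed to come from the
# report pipeline; FRSessionSummary is defined below.
#
#   summary = FRSessionSummary.create(events, bipolar_pairs, excluded_pairs,
#                                     normalized_powers, raw_events=raw)
#   df = summary.to_dataframe()               # built once, then cached
#   df = summary.to_dataframe(recreate=True)  # force a rebuild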
class FRSessionSummary(SessionSummary):
    """Free recall session summary data."""
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, raw_events=None):
        """Populate data from events.

        Parameters
        ----------
        events : np.recarray
        bipolar_pairs : dict
        excluded_pairs : dict
        normalized_powers : np.ndarray
        raw_events : np.recarray

        """
        SessionSummary.populate(self, events, bipolar_pairs, excluded_pairs,
                                normalized_powers, raw_events=raw_events)
    @property
    def intrusion_events(self):
        """ Recall events that were either extra-list or prior-list
        intrusions """
        intr_events = self.raw_events[(self.raw_events.type == 'REC_WORD') &
                                      (self.raw_events.intrusion != -999) &
                                      (self.raw_events.intrusion != 0)]
        return intr_events

    @property
    def num_words(self):
        """ Number of words in the session """
        return len(self.events[self.events.type == 'WORD'])

    @property
    def num_correct(self):
        """ Number of correctly recalled words """
        return np.sum(self.events[self.events.type == 'WORD'].recalled)

    @property
    def num_prior_list_intrusions(self):
        """ Calculates the number of prior-list intrusions """
        return np.sum(self.intrusion_events.intrusion > 0)

    @property
    def num_extra_list_intrusions(self):
        """ Calculates the number of extra-list intrusions """
        return np.sum(self.intrusion_events.intrusion == -1)

    @property
    def num_lists(self):
        """Returns the total number of lists."""
        return len(np.unique(self.events.list))

    @property
    def percent_recalled(self):
        """Calculates the percentage of correctly recalled words."""
        return 100 * self.num_correct / self.num_words
    @staticmethod
    def serialpos_probabilities(summaries, first=False):
        """Computes the mean recall probability by word serial position.

        Parameters
        ----------
        summaries : List[Summary]
            Summaries of sessions.
        first : bool
            When True, return the probability that each serial position was
            the first recalled word. Otherwise, return the probability of
            recall for each word by serial position.

        Returns
        -------
        List[float]

        """
        columns = ['serialpos', 'list', 'recalled', 'type']
        events = pd.concat([pd.DataFrame(s.events[columns])
                            for s in summaries])
        events = events[events.type == 'WORD']

        if first:
            firstpos = np.zeros(len(events.serialpos.unique()), dtype=float)
            for listno in events.list.unique():
                try:
                    nonzero = events[(events.list == listno) &
                                     (events.recalled == 1)].serialpos.iloc[0]
                except IndexError:  # no items recalled this list
                    continue
                thispos = np.zeros(firstpos.shape, firstpos.dtype)
                thispos[nonzero - 1] = 1
                firstpos += thispos
            return (firstpos / events.list.max()).tolist()
        else:
            group = events.groupby('serialpos')
            return group.recalled.mean().tolist()
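
# Usage sketch (editor's illustration): the two modes of
# serialpos_probabilities over a list of FR session summaries.
#
#   p_recall = FRSessionSummary.serialpos_probabilities(summaries)
#   p_first = FRSessionSummary.serialpos_probabilities(summaries, first=True)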
class CatFRSessionSummary(FRSessionSummary):
    """ Extends standard FR session summaries for categorized free recall
    experiments. """
    _repetition_ratios = Unicode(desc='repetition ratio by subject')
    irt_within_cat = Array(
        desc='average inter-response time within categories')
    irt_between_cat = Array(
        desc='average inter-response time between categories')
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, raw_events=None,
                 repetition_ratio_dict=None):
        """ Populates the CatFRSessionSummary object """
        FRSessionSummary.populate(self, events, bipolar_pairs, excluded_pairs,
                                  normalized_powers, raw_events=raw_events)
        # Avoid a mutable default argument: fall back to an empty dict
        self.repetition_ratios = repetition_ratio_dict or {}

        # Calculate between- and within-category IRTs based on the REC_EVENT
        # events found in all_events.json. Exclude all intrusions so that a
        # transition between an intrusion and a recall will not be counted
        # towards either within or between times.
        catfr_events = events[(events.experiment == 'catFR1') &
                              (events.type == 'REC_EVENT') &
                              (events.intrusion == 0) &
                              (events.recalled == 1)]

        # recalled == 0 indicates a baseline recall event
        cat_recalled_events = catfr_events[(catfr_events.recalled == 1)]

        irt_within_cat = []
        irt_between_cat = []
        for session in np.unique(catfr_events.session):
            cat_sess_recalls = cat_recalled_events[
                cat_recalled_events.session == session]
            for listno in np.unique(cat_sess_recalls.list):
                cat_sess_list_recalls = cat_sess_recalls[
                    cat_sess_recalls.list == listno]
                irts = np.diff(cat_sess_list_recalls.mstime)
                within = np.diff(cat_sess_list_recalls.category_num) == 0
                irt_within_cat.extend(irts[within])
                irt_between_cat.extend(irts[~within])

        self.irt_within_cat = irt_within_cat
        self.irt_between_cat = irt_between_cat
    @property
    def raw_repetition_ratios(self):
        """ Dictionary whose keys are subject identifiers for subjects
        completing at least one catFR session and whose values are the
        repetition ratios for that subject by list """
        mydict = json.loads(self._repetition_ratios)
        mydict = {k: np.array(v) for k, v in mydict.items()}
        return mydict

    @property
    def repetition_ratios(self):
        """ Array containing, for each subject completing at least one catFR
        session, that subject's repetition ratio averaged over the session """
        return np.hstack([np.nanmean(v)
                          for k, v in self.raw_repetition_ratios.items()])

    @repetition_ratios.setter
    def repetition_ratios(self, new_repetition_ratios):
        serializable_ratios = {k: v.tolist()
                               for k, v in new_repetition_ratios.items()}
        self._repetition_ratios = json.dumps(serializable_ratios)

    @property
    def irt_within_category(self):
        """ Within-category inter-response time """
        return self.irt_within_cat

    @property
    def irt_between_category(self):
        """ Between-category inter-response time """
        return self.irt_between_cat

    @property
    def subject_ratio(self):
        """ Repetition ratio for the current subject """
        return np.nanmean(self.raw_repetition_ratios[self.subject])
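
# Serialization sketch (editor's illustration): repetition ratios round-trip
# through JSON, so numpy arrays are listified on set and rebuilt on get. The
# subject code below is made up.
#
#   catfr = CatFRSessionSummary()
#   catfr.repetition_ratios = {'R0000X': np.array([0.4, 0.5, np.nan])}
#   catfr.raw_repetition_ratios['R0000X']   # -> array([0.4, 0.5, nan])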
class StimSessionSummary(SessionSummary):
    """SessionSummary data specific to sessions with stimulation."""
    _post_stim_prob_recall = CArray(dtype=float,
                                    desc='classifier output in post-stim '
                                         'period',
                                    default=np.array([]))
    _model_metadata = Bytes(desc='traces for Bayesian multilevel models')
    _post_stim_eeg = ArrayOrNone(desc='raw post-stim EEG')
    _stim_tstats = CArray(dtype=[('stim_tstats', float),
                                 ('stim_pvals', float)],
                          desc='t-statistics from artifact detection')

    @property
    def post_stim_prob_recall(self):
        """ Classifier output in the post-stim period """
        return self._post_stim_prob_recall

    @post_stim_prob_recall.setter
    def post_stim_prob_recall(self, new_post_stim_prob_recall):
        if new_post_stim_prob_recall is not None:
            self._post_stim_prob_recall = \
                new_post_stim_prob_recall.flatten().tolist()

    @property
    def model_metadata(self):
        # Reverse the base64 encoding applied by the setter before
        # unpickling
        metadata = pickle.loads(base64.b64decode(self._model_metadata))
        return metadata

    @model_metadata.setter
    def model_metadata(self, new_model_metadata):
        """ Save the dictionary of model traces such that it can be stored
        in HDF5 """
        # Use pickle to convert to a byte string, then base64-encode to
        # remove NULL characters that are not handled well by HDF5
        metadata = pickle.dumps(new_model_metadata)
        metadata = base64.b64encode(metadata)
        self._model_metadata = metadata
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, post_stim_prob_recall=None,
                 raw_events=None, model_metadata=None, post_stim_eeg=None,
                 stim_tstats=None):
        """ Populate stim data from events """
        SessionSummary.populate(self, events, bipolar_pairs, excluded_pairs,
                                normalized_powers, raw_events=raw_events)
        if post_stim_prob_recall is not None:
            self.post_stim_prob_recall = post_stim_prob_recall
        if model_metadata:
            self.model_metadata = model_metadata
        if post_stim_eeg is not None:
            self._post_stim_eeg = post_stim_eeg
        if stim_tstats is not None:
            self._stim_tstats = stim_tstats
    @classmethod
    def stim_tstats_by_condition(cls, session_summaries):
        """ Split stim t-statistics into channels without (good) and with
        (bad) significant artifact at p = 0.001 """
        good_tstats = [x for summary in session_summaries
                       for x in summary.stim_tstats[summary.stim_pvals > 0.001]]
        bad_tstats = [x for summary in session_summaries
                      for x in summary.stim_tstats[summary.stim_pvals < 0.001]]
        return good_tstats, bad_tstats

    @property
    def stim_tstats(self):
        return self._stim_tstats['stim_tstats']

    @property
    def stim_pvals(self):
        return self._stim_tstats['stim_pvals']

    @property
    def used_pair_mask(self):
        bipolar_pairs = pd.DataFrame.from_dict(
            self.bipolar_pairs[self.subject]['pairs'])
        bipolar_pairs = bipolar_pairs.T.sort_values(
            by=['channel_1', 'channel_2'])
        bipolar_pairs = bipolar_pairs.T.to_dict(into=OrderedDict)
        bipolar_pairs = OrderedDict({self.subject: {'pairs': bipolar_pairs}})
        return get_used_pair_mask(bipolar_pairs, self.excluded_pairs)

    @property
    def n_excluded_pairs(self):
        return len(self.used_pair_mask) - sum(self.used_pair_mask)

    @property
    def post_stim_eeg_plot(self):
        if self._post_stim_eeg is None:
            return ''
        else:
            pairs = ['%s-\n%s' % (pair['label0'], pair['label1'])
                     for pair in generate_pairs_for_classifier(
                         self.bipolar_pairs, [])]
            used_pair_mask = self.used_pair_mask
            return [encode_file(save_eeg_by_channel_plot(
                        pairs[i:i + 1],
                        self._post_stim_eeg[i:i + 1],
                        used_pair_mask[i:i + 1]))
                    for i in range(len(pairs))]

    @property
    def subject(self):
        """ Subject ID associated with the session """
        return extract_subject(self.events, add_localization=False)
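
# Round-trip sketch (editor's illustration): model_metadata is pickled and
# base64-encoded on the way in (so no NULL bytes reach HDF5), which is why
# the getter base64-decodes before unpickling.
#
#   s = StimSessionSummary()
#   s.model_metadata = {'trace': [1, 2, 3]}
#   assert s.model_metadata == {'trace': [1, 2, 3]}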
class FRStimSessionSummary(FRSessionSummary, StimSessionSummary):
    """ SessionSummary for FR sessions with stim """
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, post_stim_prob_recall=None,
                 raw_events=None, model_metadata=None, post_stim_eeg=None,
                 stim_tstats=None):
        FRSessionSummary.populate(self, events, bipolar_pairs, excluded_pairs,
                                  normalized_powers, raw_events=raw_events)
        StimSessionSummary.populate(
            self, events, bipolar_pairs, excluded_pairs, normalized_powers,
            post_stim_prob_recall=post_stim_prob_recall,
            raw_events=raw_events, model_metadata=model_metadata,
            post_stim_eeg=post_stim_eeg, stim_tstats=stim_tstats)
    @staticmethod
    def combine_sessions(summaries):
        """ Combine information from multiple stim sessions into a single
        dataframe """
        all_summary_dfs = [summary.to_dataframe() for summary in summaries]
        return pd.concat(all_summary_dfs)
    @staticmethod
    def all_post_stim_prob_recall(summaries, phase=None):
        """ Classifier output in the post-stim period across all sessions """
        post_stim_prob_recall = [summary.post_stim_prob_recall
                                 for summary in summaries]
        return np.concatenate(post_stim_prob_recall).tolist()
    @staticmethod
    def pre_stim_prob_recall(summaries, phase=None):
        """ Classifier output in the pre-stim period for items that were
        eventually stimulated """
        df = FRStimSessionSummary.combine_sessions(summaries)
        pre_stim_probs = df[df['is_stim_item'] ==
                            True].classifier_output.values.tolist()
        return pre_stim_probs
    @staticmethod
    def num_nonstim_lists(summaries):
        """Returns the number of non-stim lists."""
        df = FRStimSessionSummary.combine_sessions(summaries)
        count = 0
        for listno in df.list.unique():
            if not df[df.list == listno].is_stim_list.all():
                count += 1
        return count
    @staticmethod
    def num_stim_lists(summaries):
        """Returns the number of stim lists."""
        df = FRStimSessionSummary.combine_sessions(summaries)
        count = 0
        for listno in df.list.unique():
            if df[df.list == listno].is_stim_list.all():
                count += 1
        return count
    @staticmethod
    def stim_events_by_list(summaries):
        """ Array containing the number of stim events by list """
        df = FRStimSessionSummary.combine_sessions(summaries)
        return df.groupby('list').is_stim_item.sum().tolist()
    @staticmethod
    def prob_stim_by_serialpos(summaries):
        """ Array containing the probability of stimulation (mean of the
        classifier output) by serial position """
        df = FRStimSessionSummary.combine_sessions(summaries)
        return df.groupby('serialpos').classifier_output.mean().tolist()
    @staticmethod
    def lists(summaries, stim=None):
        """ Get a list of either stim lists or non-stim lists """
        df = FRStimSessionSummary.combine_sessions(summaries)
        if stim is not None:
            lists = df[df.is_stim_list == stim].list.unique().tolist()
        else:
            lists = df.list.unique().tolist()
        return lists
    @property
    def stim_columns(self):
        """ Fields associated with stimulation parameters """
        return ['stimAnodeTag', 'stimCathodeTag', 'location', 'amplitude',
                'stim_duration', 'pulse_freq']
    @staticmethod
    def stim_params_by_list(summaries):
        """ Returns a dataframe of the stimulation parameters used within
        each session/list """
        df = FRStimSessionSummary.combine_sessions(summaries)
        df = df.replace('nan', np.nan)
        stim_columns = ['stimAnodeTag', 'stimCathodeTag', 'location',
                        'amplitude', 'stim_duration', 'pulse_freq']
        non_stim_columns = [c for c in df.columns if c not in stim_columns]
        static_columns = [c for c in
                          ['subject', 'experiment', 'session', 'list']
                          if c in df.columns]
        stim_param_by_list = (df[stim_columns + static_columns]
                              .drop_duplicates()
                              .dropna(how='all'))

        # This ensures that for any given list, the stim parameters used
        # during that list are populated, which makes calculating post-stim
        # item behavioral responses easier
        df = df[non_stim_columns]
        df = df.merge(stim_param_by_list,
                      on=['subject', 'experiment', 'session', 'list'],
                      how='left')
        return df
    @staticmethod
    def stim_parameters(summaries):
        """ Returns a list of the unique stimulation parameters used during
        the experiment """
        df = FRStimSessionSummary.stim_params_by_list(summaries)
        return FRStimSessionSummary.aggregate_stim_params_over_list(df)
    @staticmethod
    def aggregate_stim_params_over_list(df):
        """ Aggregate stim parameters over lists, counting stimulations and
        trials per unique parameter set """
        df['location'] = df['location'].replace(np.nan, '--')
        stim_columns = ['stimAnodeTag', 'stimCathodeTag', 'location',
                        'amplitude', 'stim_duration', 'pulse_freq']
        grouped = (df.groupby(by=stim_columns + ['is_stim_list'])
                   .agg({'is_stim_item': 'sum', 'subject': 'count'})
                   .rename(columns={'is_stim_item': 'n_stimulations',
                                    'subject': 'n_trials'})
                   .reset_index())
        return list(grouped.T.to_dict().values())
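
# Output-shape sketch (editor's illustration): aggregate_stim_params_over_list
# returns one record per (stim parameter set, is_stim_list) combination; the
# keys follow from the grouped columns above, the values here are invented.
#
#   [{'stimAnodeTag': 'LA1', 'stimCathodeTag': 'LA2', 'location': '--',
#     'amplitude': 0.5, 'stim_duration': 500, 'pulse_freq': 200,
#     'is_stim_list': True, 'n_stimulations': 120, 'n_trials': 300}, ...]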
    @staticmethod
    def recall_test_results(summaries, experiment):
        """ Returns a list of dictionaries containing the results of
        chi-squared tests for the behavioral effects of stimulation.
        Comparisons include stim lists vs. non-stim lists, stim items vs.
        low-biomarker non-stim items, and post-stim items vs. low-biomarker
        non-stim items. All comparisons are done for each unique set of
        stimulation parameters.
        """
        df = FRStimSessionSummary.stim_params_by_list(summaries)

        if "PS5" not in experiment:
            df = df[df.list > 3]
        else:
            df = df[df.list > -1]

        results = []
        for name, group in df.groupby(['stimAnodeTag', 'stimCathodeTag',
                                       'amplitude', 'stim_duration',
                                       'pulse_freq']):
            parameters = "/".join([str(n) for n in name])

            # Stim lists vs. non-stim lists
            n_correct_stim_list_recalls = \
                group[group.is_stim_list == True].recalled.sum()
            n_correct_nonstim_list_recalls = \
                df[df.is_stim_list == False].recalled.sum()
            n_stim_list_words = \
                group[group.is_stim_list == True].recalled.count()
            n_nonstim_list_words = \
                df[df.is_stim_list == False].recalled.count()
            tstat_list, pval_list, _ = proportions_chisquare(
                [n_correct_stim_list_recalls, n_correct_nonstim_list_recalls],
                [n_stim_list_words, n_nonstim_list_words])
            results.append({
                "parameters": parameters,
                "comparison": "Stim Lists vs. Non-stim Lists",
                "stim": (n_correct_stim_list_recalls, n_stim_list_words),
                "non-stim": (n_correct_nonstim_list_recalls,
                             n_nonstim_list_words),
                "t-stat": tstat_list,
                "p-value": pval_list})

            # Stim items vs. low-biomarker non-stim items
            n_correct_stim_item_recalls = \
                group[group.is_stim_item == True].recalled.sum()
            n_correct_nonstim_item_recalls = \
                df[(df.is_stim_item == False) &
                   (df.classifier_output < df.thresh)].recalled.sum()
            n_stim_items = group[group.is_stim_item == True].recalled.count()
            n_nonstim_items = df[(df.is_stim_item == False) &
                                 (df.classifier_output < df.thresh)
                                 ].recalled.count()
            tstat_list, pval_list, _ = proportions_chisquare(
                [n_correct_stim_item_recalls, n_correct_nonstim_item_recalls],
                [n_stim_items, n_nonstim_items])
            results.append({
                "parameters": parameters,
                "comparison": "Stim Items vs. Low Biomarker Non-stim Items",
                "stim": (n_correct_stim_item_recalls, n_stim_items),
                "non-stim": (n_correct_nonstim_item_recalls, n_nonstim_items),
                "t-stat": tstat_list,
                "p-value": pval_list})

            # Post-stim items vs. low-biomarker non-stim items
            n_correct_post_stim_item_recalls = \
                group[group.is_post_stim_item == True].recalled.sum()
            n_post_stim_items = \
                group[group.is_post_stim_item == True].recalled.count()
            tstat_list, pval_list, _ = proportions_chisquare(
                [n_correct_post_stim_item_recalls,
                 n_correct_nonstim_item_recalls],
                [n_post_stim_items, n_nonstim_items])
            results.append({
                "parameters": parameters,
                "comparison": "Post-stim Items vs. Low Biomarker Non-stim "
                              "Items",
                "stim": (n_correct_post_stim_item_recalls, n_post_stim_items),
                "non-stim": (n_correct_nonstim_item_recalls, n_nonstim_items),
                "t-stat": tstat_list,
                "p-value": pval_list})

        return results
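
# Test sketch (editor's illustration): each comparison above reduces to a
# two-sample chi-square test of proportions on (successes, totals); a
# standalone call with made-up counts looks like:
#
#   chi2, p, _ = proportions_chisquare([30, 40], [100, 150])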
    @staticmethod
    def recalls_by_list(summaries, stim_list_only=False):
        """ Number of recalls by list. Optionally returns results for only
        stim lists """
        df = FRStimSessionSummary.combine_sessions(summaries)
        if stim_list_only:
            recalls_by_list = (df[df.is_stim_list == stim_list_only]
                               .groupby('list')
                               .recalled
                               .sum()
                               .astype(int)
                               .tolist())
        else:
            recalls_by_list = (df.groupby('list')
                               .recalled
                               .sum()
                               .astype(int)
                               .tolist())
        return recalls_by_list
    @staticmethod
    def prob_first_recall_by_serialpos(summaries, stim=False):
        """ Probability of recalling a word first by serial position.
        Optionally returns results for only stim items """
        df = FRStimSessionSummary.combine_sessions(summaries)
        events = df[df.is_stim_item == stim]

        firstpos = np.zeros(ExperimentParameters().number_of_items,
                            dtype=float)
        for listno in events.list.unique():
            try:
                nonzero = events[(events.list == listno) &
                                 (events.recalled == 1)].serialpos.iloc[0]
            except IndexError:  # no items recalled this list
                continue
            thispos = np.zeros(firstpos.shape, firstpos.dtype)
            thispos[nonzero - 1] = 1
            firstpos += thispos
        return (firstpos / events.list.max()).tolist()
    @staticmethod
    def prob_recall_by_serialpos(summaries, stim_items_only=False):
        """ Probability of recall by serial position. Optionally returns
        results for only stim items """
        df = FRStimSessionSummary.combine_sessions(summaries)
        group = df[df.is_stim_item == stim_items_only].groupby('serialpos')
        return group.recalled.mean().tolist()
    @staticmethod
    def delta_recall(summaries, post_stim_items=False):
        """ % change in item recall for stimulated items versus
        non-stimulated low-biomarker items. Optionally return the same
        comparison for post-stim items """
        df = FRStimSessionSummary.combine_sessions(summaries)
        nonstim_low_bio_recall = df[(df.classifier_output < df.thresh) &
                                    (df.is_stim_list == False)].recalled.mean()
        if post_stim_items:
            recall_stim = df[df.is_post_stim_item == True].recalled.mean()
        else:
            recall_stim = df[df.is_stim_item == True].recalled.mean()

        delta_recall = 100 * ((recall_stim - nonstim_low_bio_recall) /
                              df.recalled.mean())
        return delta_recall
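
# Worked example (editor's illustration): delta_recall is a percent change
# normalized by the overall recall rate. With stim-item recall 0.30,
# low-biomarker non-stim recall 0.24, and overall recall 0.25:
#
#   100 * (0.30 - 0.24) / 0.25 == 24.0   # percent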
class FR5SessionSummary(FRStimSessionSummary):
    """ FR5-specific summary """

    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, post_stim_prob_recall=None,
                 raw_events=None, model_metadata=None):
        FRStimSessionSummary.populate(
            self, events, bipolar_pairs, excluded_pairs, normalized_powers,
            raw_events=raw_events,
            post_stim_prob_recall=post_stim_prob_recall,
            model_metadata=model_metadata)


class TICLFRSessionSummary(FRStimSessionSummary):
    biomarker_events = ArrayOrNone

    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, post_stim_prob_recall=None,
                 raw_events=None, model_metadata=None, post_stim_eeg=None,
                 biomarker_events=None, stim_tstats=None):
        FRStimSessionSummary.populate(self, events, bipolar_pairs,
                                      excluded_pairs, normalized_powers,
                                      post_stim_prob_recall, raw_events,
                                      model_metadata, post_stim_eeg,
                                      stim_tstats=stim_tstats)
        self.biomarker_events = biomarker_events

    def nstims(self, task_phase):
        """ Number of stim events within the given task phase """
        if self.raw_events is None:
            return 0
        return (self.raw_events[self.raw_events.type == 'STIM_ON'
                                ].phase == task_phase).sum()

    def classifier_output(self, phase, position):
        """ Classifier output for biomarker events.

        :param phase: either "ENCODING", "DISTRACT", or "RETRIEVAL"
        :param position: either "pre" or "post"
        """
        biomarker_events = self.biomarker_events[
            self.biomarker_events['biomarker_value'] >= 0]
        in_phase = biomarker_events['phase'] == phase
        this_position = biomarker_events['position'] == position
        if position == 'post':
            return biomarker_events[in_phase & this_position][
                'biomarker_value']
        else:
            # Only want "real" pre-stim events, i.e. ones with a matching
            # post-stim event
            ids = biomarker_events[in_phase & this_position]['id']
            has_match = np.in1d(ids,
                                biomarker_events[~this_position]['id'])
            return biomarker_events[
                in_phase & this_position][has_match]['biomarker_value']

    @staticmethod
    def pre_stim_prob_recall(summaries, phase=None):
        if phase is None:
            phases = ['ENCODING', 'DISTRACT', 'RETRIEVAL']
        else:
            phases = [phase]
        return np.concatenate([
            summary.classifier_output(phase_, 'pre')
            for summary in summaries
            for phase_ in phases
        ]).tolist()

    @staticmethod
    def all_post_stim_prob_recall(summaries, phase=None):
        if phase is None:
            phases = ['ENCODING', 'DISTRACT', 'RETRIEVAL']
        else:
            phases = [phase]
        return np.concatenate([
            summary.classifier_output(phase_, 'post')
            for summary in summaries
            for phase_ in phases
        ]).tolist()
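
# Usage sketch (editor's illustration): pulling matched pre- and post-stim
# classifier outputs for one task phase across TICL sessions.
#
#   pre = TICLFRSessionSummary.pre_stim_prob_recall(summaries, 'ENCODING')
#   post = TICLFRSessionSummary.all_post_stim_prob_recall(summaries,
#                                                         'ENCODING')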
class PSSessionSummary(SessionSummary):
    """ Parameter Search experiment summary """
    def populate(self, events, bipolar_pairs, excluded_pairs,
                 normalized_powers, raw_events=None):
        SessionSummary.populate(self, events, bipolar_pairs, excluded_pairs,
                                normalized_powers, raw_events=raw_events)
    @property
    def decision(self):
        """ Return a dictionary containing decision information from the
        Bayesian optimization algorithm """
        decision_dict = {
            'converged': True,
            'sham_dc': '',
            'sham_sem': '',
            'best_location': '',
            'best_amplitude': '',
            'pval': '',
            'tstat': '',
            'tie': '',
            'tstat_vs_sham': '',
            'pval_vs_sham': '',
            'loc1': {},
            'loc2': {},
        }
        events_df = pd.DataFrame.from_records(
            [e for e in self.events], columns=self.events.dtype.names)
        decision = self.events[(self.events.type == 'OPTIMIZATION_DECISION')]

        # If a session completes with convergence, there will be an
        # optimization decision event at the end. Otherwise, we need to
        # manually calculate one
        if len(decision) > 0:
            decision_dict['sham_dc'] = decision.sham.delta_classifier[0]
            decision_dict['sham_sem'] = decision.sham.sem[0]
            decision_dict['best_location'] = \
                decision.decision.best_location[0]
            decision_dict['best_amplitude'] = (
                decision.loc1
                if decision.loc1.loc_name == decision_dict['best_location']
                else decision.loc2).amplitude[0]
            decision_dict['pval'] = decision.decision.p_val[0]
            decision_dict['tstat'] = decision.decision.t_stat[0]
            decision_dict['tie'] = decision.decision.tie[0]
            decision_dict['tstat_vs_sham'] = decision.sham.t_stat[0]
            decision_dict['pval_vs_sham'] = decision.sham.p_val[0]
            decision_dict['loc1'] = decision.loc1
            decision_dict['loc2'] = decision.loc2
        else:
            decision_dict['converged'] = False
            opt_events = events_df.loc[events_df.type == 'OPTIMIZATION']

            # Unpack each stim location's name and its (amplitude,
            # delta_classifier) dataset from the optimization events
            (locations, loc_datasets) = zip(
                *[('_'.join(name),
                   table.loc[:, ['amplitude', 'delta_classifier']].values)
                  for (name, table) in
                  opt_events.groupby(['anode_label', 'cathode_label'])])

            # TODO: include sham delta classifiers when we need to
            # reconstruct results
            if len(locations) > 1:
                decision, loc_info = choose_location(
                    loc_datasets[0], locations[0],
                    loc_datasets[1], locations[1],
                    np.array([(ld.min(), ld.max()) for ld in loc_datasets]),
                    None)
            else:
                return

            for i, k in enumerate(loc_info):
                loc_info[k]['amplitude'] = loc_info[k]['amplitude'] / 1000
                decision_dict['loc%s' % (i + 1)] = loc_info[k]

            decision_dict['tie'] = decision['Tie']
            decision_dict['best_location'] = decision['best_location_name']
            decision_dict['best_amplitude'] = loc_info[
                decision_dict['best_location']]['amplitude']
            decision_dict['pval'] = decision['p_val']
            decision_dict['tstat'] = decision['t_stat']

        return decision_dict

    @property
    def location_summary(self):
        """ Return a dictionary whose keys are the locations stimulated in
        the experiment and whose values are dictionaries containing
        additional metadata about the results from stimulating at that
        location """
        location_summaries = {}
        events_df = pd.DataFrame.from_records(
            [e for e in self.events], columns=self.events.dtype.names)
        events_by_location = events_df.groupby(['anode_label',
                                                'cathode_label'])
        for location, loc_events in events_by_location:
            location_summary = {
                'amplitude': {},
                'delta_classifier': {},
                'post_stim_biomarker': {},
                'post_stim_amplitude': {},
                'best_amplitude': '',
                'best_delta_classifier': '',
                'sem': '',
                'snr': ''
            }
            if location[0] and location[1]:
                loc_tag = '%s_%s' % (location[0], location[1])
                opt_events = (loc_events.loc[loc_events.type ==
                                             'OPTIMIZATION']
                              .groupby('list_phase'))
                for i, (phase, phase_opt_events) in enumerate(opt_events):
                    post_stim_phase_events = loc_events.loc[
                        (events_df.list_phase == phase) &
                        (events_df.type == 'BIOMARKER') &
                        (events_df.position == 'POST')]
                    decision = self.decision
                    if decision['loc1']['loc_name'] == loc_tag:
                        loc_decision_info = decision['loc1']
                    else:
                        loc_decision_info = decision['loc2']

                    location_summary['amplitude'][phase] = \
                        (phase_opt_events.amplitude.values / 1000.).tolist()
                    location_summary['delta_classifier'][phase] = \
                        phase_opt_events.delta_classifier.values.tolist()
                    location_summary['post_stim_biomarker'][phase] = \
                        post_stim_phase_events.biomarker_value.tolist()
                    location_summary['post_stim_amplitude'][phase] = \
                        (post_stim_phase_events.amplitude.values /
                         1000.).tolist()

                    if len(loc_decision_info) > 0:
                        location_summary['best_amplitude'] = float(
                            loc_decision_info['amplitude'])
                        location_summary['best_delta_classifier'] = float(
                            loc_decision_info['delta_classifier'])
                        location_summary['sem'] = float(
                            loc_decision_info['sem'])
                        location_summary['snr'] = float(
                            loc_decision_info['snr'])
                location_summaries[loc_tag] = location_summary

        return location_summaries
class LocationSearchSessionSummary(StimSessionSummary):
    connectivity = Array
    pre_psd = Array
    post_psd = Array
    _bad_events_mask = CArray
    _bad_channels_mask = CArray
    _regressions = ArrayOrNone

    @property
    def bipolar_pairs_frame(self):
        bpdict = self.bipolar_pairs[self.subject]['pairs']
        bpdf = pd.DataFrame.from_dict(bpdict, orient='index')
        bpdf.channel_1 = bpdf.channel_1.astype(int)
        bpdf.channel_2 = bpdf.channel_2.astype(int)
        return bpdf.sort_values(by=['channel_1', 'channel_2']).reset_index()

    @property
    def distmat(self):
        return get_distances(self.bipolar_pairs_frame)

    @property
    def stim_channel_idxs(self):
        return tmi.get_stim_channels(self.bipolar_pairs_frame, self.events,
                                     'stimAnodeTag', 'stimCathodeTag')

    @property
    def bad_channels_mask(self):
        # TODO: parametrize the threshold of 20 bad events per channel
        return self._bad_channels_mask | (
            (self._bad_events_mask.sum(0) > 20).squeeze())

    @property
    def used_pair_mask(self):
        return ~self.bad_channels_mask

    @property
    def n_excluded_pairs(self):
        return self.bad_channels_mask.sum()

    @property
    def regressions(self):
        if self._regressions is None:
            self._regressions, _ = tmi.regress_distance(
                self.pre_psd, self.post_psd, self.connectivity, self.distmat,
                self.stim_channel_idxs, event_mask=self._bad_events_mask,
                artifact_channels=self._bad_channels_mask)
        return self._regressions

    @property
    def stim_tag(self):
        return '-'.join(LocationSearchSessionSummary.stim_params(
            [self])[0][['stimAnodeTag', 'stimCathodeTag']])

    @property
    def id(self):
        return ":".join([self.subject, self.experiment, self.session_number,
                         self.stim_tag])

    @property
    def tmi(self):
        return tmi.compute_tmi(self.regressions)

    @staticmethod
    def stim_params(summaries):
        df = FRStimSessionSummary.combine_sessions(summaries)
        stim_columns = FRStimSessionSummary().stim_columns
        stim_columns = [c for c in stim_columns if c in df.columns]
        stim_params_table = df[stim_columns].drop_duplicates().dropna(
            how='all')
        stim_channel_labels = [summary.bipolar_pairs_frame.iloc[idx]['label']
                               for summary in summaries
                               for idx in summary.stim_channel_idxs]
        tmi_list = [tmi_val['zscore']
                    for summary in summaries
                    for tmi_val in summary.tmi]
        for (stim_channel, tmi_val) in zip(stim_channel_labels, tmi_list):
            anode, cathode = stim_channel.split('-')
            stim_params_table.loc[
                (stim_params_table.stimAnodeTag == anode) &
                (stim_params_table.stimCathodeTag == cathode),
                'TMI'] = tmi_val
        return stim_params_table.dropna().to_records()

    def populate(self, events, bipolar_pairs, excluded_pairs, connectivity,
                 pre_psd, post_psd, bad_events_mask, bad_channel_mask,
                 stim_tstats=None, **kwargs):
        StimSessionSummary.populate(self, events, bipolar_pairs,
                                    excluded_pairs, None,
                                    stim_tstats=stim_tstats, **kwargs)
        self.connectivity = connectivity
        self.post_psd = post_psd
        self.pre_psd = pre_psd
        self._bad_channels_mask = bad_channel_mask
        self._bad_events_mask = bad_events_mask
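
# Pipeline sketch (editor's illustration): once populated, a location-search
# summary lazily runs the distance regression and exposes TMI z-scores. The
# structure of tmi.compute_tmi's return value (a list of dicts with a
# 'zscore' key) is assumed from its use in stim_params above.
#
#   summary = LocationSearchSessionSummary()
#   summary.populate(events, bipolar_pairs, excluded_pairs, connectivity,
#                    pre_psd, post_psd, bad_events_mask, bad_channel_mask)
#   zscores = [t['zscore'] for t in summary.tmi]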