# Source code for ramutils.tasks.misc

from __future__ import unicode_literals

import os
import pandas as pd

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

from ptsa.data.readers import JsonIndexReader
from ramutils.io import store_results
from ramutils.utils import get_session_str

from ._wrapper import task
from ramutils.reports.summary import *
from ramutils.hmm import save_foresplot, save_traceplot
from ramutils.utils import is_stim_experiment as is_stim_experiment_core
from ramutils.utils import get_completed_sessions
from ramutils.utils import encode_file
from ramutils.utils import extract_experiment_series
from ramutils.log import get_logger

logger = get_logger()


__all__ = [
    'read_index',
    'is_stim_experiment',
    'save_all_output',
    'load_existing_results'
]


@task()
def read_index(mount_point='/'):
    """Build a JSON index reader for the ``r1.json`` protocol index.

    :param str mount_point: Root directory under which
        ``protocols/r1.json`` is looked up.
    :returns: JsonIndexReader constructed from that path

    """
    index_path = os.path.join(mount_point, 'protocols', 'r1.json')
    return JsonIndexReader(index_path)


@task(cache=False)
def is_stim_experiment(experiment):
    """Task wrapper around :func:`ramutils.utils.is_stim_experiment`.

    :param experiment: Experiment identifier, forwarded unchanged.
    :returns: The result of the underlying helper for ``experiment``.

    """
    return is_stim_experiment_core(experiment)


@task(cache=False)
def save_all_output(subject, experiment, session_summaries, math_summaries,
                    classifier_evaluation_results, save_location,
                    retrained_classifier=None, target_selection_table=None,
                    behavioral_results=None, agg_report=False):
    """ Save all required output necessary to re-generate a report

    Parameters
    ----------
    subject: str
        Subject ID
    experiment: str
        Experiment name
    session_summaries: List
        List of SessionSummary derived objects
    math_summaries: List
        List of MathSummary objects
    classifier_evaluation_results: List
        List of ClassifierSummary objects
    save_location: str
        Destination for data to be saved. Typically in
        /data10/RAM/report_database/ on RHINO
    retrained_classifier: ClassifierContainer
        Serialized representation of the retrained classifier
    target_selection_table: pd.DataFrame
        DataFrame representation of the target selection table, formerly known
        as the subsequent memory effect table. Only written when it is not
        None and has at least one row
    behavioral_results: dict
        Keys are the behavioral effect model type (stim list, stim item, etc.)
        and values are the traces from estimating those models
    agg_report: bool
        If True, the joined session numbers are replaced by an empty string
        in the session part of the file names built from them (aggregate
        reports can span hundreds of sessions)

    Returns
    -------
    result_files: dict
        Dictionary whose keys are the names of statically-produced plots and
        values are encoded versions of those images. These are used to embed
        the static plots in the html reports during report generation. Only
        the forest plot of each behavioral result is encoded and returned

    Notes
    -----
    All output files are of the format {subject}_{experiment}_{session}_{data_type}.{file_type}
    where data_type is a generic name for the type of data being saved. The following data types
    map to a summary object:

    * session_summary: :class:`ramutils.reports.summary.SessionSummary`
    * math_summary: :class:`ramutils.reports.summary.MathSummary`
    * classifier_[tag]: :class:`ramutils.reports.summary.ClassifierSummary`

    The retrained classifier is the exception: it is saved once per subject as
    {subject}_retrained_classifier.zip

    """

    result_files = {}

    base_output_format = os.path.join(save_location,
                                      "{subject}_{experiment}_{session}_{"
                                      "data_type}.{file_type}")
    subject_specific_output = os.path.join(save_location,
                                           "{subject}_{data_type}.{file_type}")

    def _session_path(session, data_type, file_type):
        # Fill in the per-session file name template for this subject and
        # experiment.
        return base_output_format.format(subject=subject,
                                         experiment=experiment,
                                         session=session,
                                         data_type=data_type,
                                         file_type=file_type)

    if agg_report:
        # Agg reports could have hundreds of sessions, so do not save them
        # as part of the file name
        session_str = ""
    else:
        session_str = '_'.join([str(summary.session_number) for summary in
                                session_summaries])

    if (target_selection_table is not None) and \
            (len(target_selection_table) > 0):
        target_selection_table.to_csv(
            _session_path(session_str, 'target_selection_table', 'csv'))

    # Session and math summaries are stored one file per session
    for session_summary in session_summaries:
        store_results(session_summary,
                      _session_path(session_summary.session_number,
                                    'session_summary', 'h5'))

    for math_summary in math_summaries:
        store_results(math_summary,
                      _session_path(math_summary.session_number,
                                    'math_summary', 'h5'))

    # NOTE(review): this loop rebinds ``session_str``, so the plot file names
    # built further down use the sessions of the LAST classifier summary
    # (when there is at least one) instead of the value computed above, and
    # the ``agg_report`` blanking is lost for them. Kept as-is because
    # changing it would alter on-disk file names -- confirm whether this is
    # intended.
    for classifier_summary in classifier_evaluation_results:
        session_str = get_session_str(classifier_summary.sessions)
        store_results(classifier_summary,
                      _session_path(session_str,
                                    'classifier_' + classifier_summary.tag,
                                    'h5'))

    if retrained_classifier is not None:
        # The retrained classifier is the same regardless of session/experiment
        # so just save one per subject
        retrained_classifier.save(subject_specific_output.format(
            subject=subject, data_type="retrained_classifier",
            file_type="zip"), overwrite=True)

    # Save plots from hmm models and return the encoded forest plots in a dict
    if behavioral_results is not None:
        for name, trace in behavioral_results.items():
            forestplot_path = _session_path(session_str, name + '_foresplot',
                                            'png')
            save_foresplot(trace, forestplot_path)

            traceplot_path = _session_path(session_str, name + '_traceplot',
                                           'png')
            save_traceplot(trace, traceplot_path)

            # Read the forest plot back so it can be embedded in the report
            with open(forestplot_path, 'rb') as f:
                result_files[name] = encode_file(f)

    return result_files


@task(cache=False)
def load_existing_results(subject, experiment, sessions, stim_report, db_loc,
                          joint_report, rootdir='/'):
    """ Load previously-saved data created during report generation

    Parameters
    ----------
    subject: str
        Subject ID
    experiment: str
        Experiment ID
    sessions: list or None
        If none, then sessions are looked up from r1.json for the given subject and experiment.
    stim_report: bool
        Indicator for if the requested data is associated with a stim report
    db_loc: str
        Report database location relative to rootdir. db_loc will be appended to rootdir
        to find the full absolute path. If both db_loc and rootdir are absolute paths,
        it will be assumed that db_loc contains the root directory.
    joint_report: bool
        Only consulted when ``sessions`` is None. If set and the experiment
        name contains 'FR', completed sessions of both the FR and catFR
        experiment of the same series are combined, with catFR session
        numbers offset by 100
    rootdir: str
        RHINO mount point or root directory

    Returns
    -------
    saved_results: dict
        Mirrors the input to save_all_output. Entries that were not loaded are
        None. If a file is missing part-way through, a warning is logged and
        the dictionary is returned with whatever had been loaded up to that
        point

    Raises
    ------
    ValueError
        For a non-stim report, if the session summary class cannot be
        determined for a session (experiment is neither 'FR1' nor 'catFR1'
        and the session number is below 100)

    """

    saved_results = {
        'target_selection_table': None,
        'classifier_evaluation_results': None,
        'session_summaries': None,
        'math_summaries': None,
        'hmm_results': None
    }

    # Repetition ratio dictionary optional
    # Cases: PS, stim, non-stim
    subject_experiment = "_".join([subject, experiment])
    base_output_format = os.path.join(rootdir, db_loc, subject_experiment +
                                      "_{session}_{data_type}.{file_type}")

    def _path(session, data_type, file_type):
        # Fill in the file name template for one saved artifact.
        return base_output_format.format(session=session,
                                         data_type=data_type,
                                         file_type=file_type)

    if sessions is None:
        if joint_report and 'FR' in experiment:
            series_num = extract_experiment_series(experiment)
            fr_sessions = get_completed_sessions(subject, 'FR'+series_num,
                                                 rootdir)
            catfr_sessions = get_completed_sessions(subject, 'catFR'+series_num,
                                                    rootdir)
            # catFR sessions are offset by 100 so they can be told apart from
            # FR sessions once the two are merged
            catfr_sessions = set(str(100 + int(s)) for s in catfr_sessions)
            sessions = fr_sessions | catfr_sessions
        else:
            sessions = get_completed_sessions(subject, experiment, rootdir=rootdir)

    session_str = get_session_str(sessions)
    hmm_results = {}
    try:
        if stim_report is False:
            saved_results['target_selection_table'] = pd.read_csv(
                _path(session_str, 'target_selection_table', 'csv'))

            encoding_classifier_summary = ClassifierSummary.from_hdf(
                _path(session_str, 'classifier_Encoding', 'h5'))
            joint_classifier_summary = ClassifierSummary.from_hdf(
                _path(session_str, 'classifier_Joint', 'h5'))
            saved_results['classifier_evaluation_results'] = [
                encoding_classifier_summary, joint_classifier_summary]

            session_summaries, math_summaries = [], []
            for session in sessions:
                math_summaries.append(MathSummary.from_hdf(
                    _path(str(session), 'math_summary', 'h5')))

                if (experiment == 'catFR1') or (int(session) >= 100):
                    summary_class = CatFRSessionSummary
                elif experiment == 'FR1':
                    summary_class = FRSessionSummary
                else:
                    # Previously this fell through with the class unbound
                    # (or left over from the previous session); fail loudly
                    # rather than load with the wrong class.
                    raise ValueError(
                        'No session summary class known for experiment '
                        '{} session {}'.format(experiment, session))

                session_summaries.append(summary_class.from_hdf(
                    _path(str(session), 'session_summary', 'h5')))

            saved_results['session_summaries'] = session_summaries
            saved_results['math_summaries'] = math_summaries

        elif stim_report and 'PS' not in experiment:
            classifier_evaluation_results, math_summaries, session_summaries = [], [], []
            for session in sessions:
                classifier_evaluation_results.append(
                    ClassifierSummary.from_hdf(
                        _path(str(session),
                              'classifier_session_' + str(session), 'h5')))

                math_summaries.append(MathSummary.from_hdf(
                    _path(str(session), 'math_summary', 'h5')))

                if 'TICL' in experiment:
                    summary_class = TICLFRSessionSummary
                else:
                    summary_class = FRStimSessionSummary
                session_summaries.append(summary_class.from_hdf(
                    _path(str(session), 'session_summary', 'h5')))

                # Check if behavioral model results are saved
                if 'FR5' in experiment:
                    for name in ['list', 'stim_item', 'post_stim_item']:
                        forestplot_path = _path(str(session),
                                                name + '_foresplot', 'png')
                        # Explicit check instead of ``assert`` so it is not
                        # stripped under ``python -O``; IOError is handled
                        # below like any other missing file.
                        if not os.path.exists(forestplot_path):
                            raise IOError('Missing forest plot: ' +
                                          forestplot_path)

                        # Encode the image and pass along that data
                        with open(forestplot_path, 'rb') as f:
                            hmm_results[name] = encode_file(f)
                    saved_results['hmm_results'] = hmm_results

            saved_results['session_summaries'] = session_summaries
            saved_results['math_summaries'] = math_summaries
            saved_results['classifier_evaluation_results'] = classifier_evaluation_results

        else:
            # Nothing is loaded for the remaining cases (e.g. PS stim reports)
            return saved_results

    except (IOError, OSError, AssertionError):
        logger.warning('Not all underlying data could be found for the '
                       'requested report, building from scratch instead.')
        return saved_results

    return saved_results