Source code for meticulous.experiments
from git import Repo
import sys
from glob import glob
import os
import json
import traceback
import pandas as pd
from pandas.io.json import json_normalize
class ExperimentReader(object):
    """Class to read an experiment folder.

    On construction the reader loads ``metadata.json``, ``args.json``,
    ``default_args.json``, ``STATUS`` and ``summary.json`` from the experiment
    directory. Every one of these files is optional: a missing file leaves the
    corresponding attribute at its default value.
    """

    def __init__(self, curexpdir: str):
        """
        Read experiment data from curexpdir. Reads metadata.json, args.json, default_args.json, STATUS and summary.json.

        Args:
            curexpdir: The experiment directory to read

        Raises:
            json.JSONDecodeError: if one of the JSON files exists but cannot be parsed.
            OSError: if the creation time of curexpdir cannot be read.
        """
        #: str: Path to the directory for the current experiment
        self.curexpdir = curexpdir

        # normpath drops a trailing separator, so the id is the directory's own
        # name whether or not the path ends with os.sep.
        #: str: experiment id (name of the experiment directory)
        self.expid = os.path.basename(os.path.normpath(self.curexpdir))

        # Load metadata
        #: dict: loaded from metadata.json
        self.metadata = self._load_json('metadata.json')

        # Extract useful attributes
        # The command is stored as a list of arguments; flatten it to one string.
        self.metadata['command'] = ' '.join(self.metadata.get('command', []))
        #: value of 'githead-sha' in metadata.json, None when absent
        self.sha = self.metadata.get('githead-sha', None)
        #: value of 'start-time' in metadata.json, otherwise the ctime of the directory
        self.start_time = self.metadata.get('start-time', os.path.getctime(self.curexpdir))

        # Load args
        #: dict: loaded from args.json
        self.args = self._load_json('args.json')

        # Load default args
        #: dict: loaded from default_args.json
        self.default_args = self._load_json('default_args.json')

        # Load status
        #: str: First line of STATUS file
        self.status = 'UNKNOWN'
        #: str: Last line of STATUS file (usually contains the Python error)
        self.status_message = ''
        self.refresh_status()

        # Load summary
        #: dict: loaded from summary.json
        self.summary = {}
        self.refresh_summary()

    def _load_json(self, filename):
        """Return the parsed content of a JSON file in the experiment directory, or {} if it is missing."""
        try:
            with self.open(filename, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def open(self, *args, **kwargs):
        """wrapper around the function open to redirect to experiment directory"""
        # Only the first positional argument (the file name) is rewritten;
        # everything else is forwarded to the builtin open untouched.
        args = (os.path.join(self.curexpdir, args[0]),) + args[1:]
        return open(*args, **kwargs)

    def refresh_status(self):
        """Read STATUS file.

        Sets ``status`` to the first line and ``status_message`` to the last
        line (empty when the file has a single line), both without surrounding
        whitespace. A missing or empty file leaves both attributes unchanged.
        """
        try:
            with self.open('STATUS', 'r') as f:
                lines = list(f)
        except FileNotFoundError:
            return
        if not lines:
            return
        # Lines read from a file keep their newline; strip it so the status can
        # be compared directly against values such as 'ERROR'.
        self.status = lines[0].strip()
        self.status_message = lines[-1].strip() if len(lines) > 1 else ''

    def refresh_summary(self):
        """Read summary.json. A missing file leaves ``summary`` unchanged."""
        try:
            with self.open('summary.json', 'r') as f:
                self.summary = json.load(f)
        except FileNotFoundError:
            pass

    def __repr__(self):
        return self.curexpdir
class Experiments(object):
    """Class to load an experiments folder.

    Each sub-directory of the experiments directory is read with the
    configured reader class; the resulting readers are available by experiment
    id through ``experiments`` and through item access on this object.
    """

    def __init__(self, project_directory: str = '', experiments_directory: str = None, reader = ExperimentReader):
        """
        Load the repo from project_directory and experiments from experiments_directory using the reader class.

        Args:
            project_directory: Path to the project directory, should be part of a git repo
                (parent directories are searched for the repo).
            experiments_directory: Path to the directory that stores experiments. Used as given;
                when omitted (or empty) it defaults to ``<project_directory>/experiments``.
            reader: To allow overriding with a user defined version of ExperimentReader class.
        """
        self.project_directory = project_directory
        self.repo = Repo(self.project_directory, search_parent_directories=True)
        self.repodir = self.repo.working_dir
        if experiments_directory:
            self.experiments_directory = experiments_directory
        else:
            self.experiments_directory = os.path.join(self.project_directory, 'experiments')
        self.reader = reader
        #: Dict[str, ExperimentReader]: experiment ids mapped to respective reader objects
        self.experiments = {}
        self.refresh_experiments()

    @staticmethod
    def _start_time_sort_key(exp_reader):
        """Return a sort key for a reader's start_time that tolerates mixed types."""
        value = exp_reader.start_time
        # start_time is either the value stored in metadata.json or a float
        # ctime. Those may not be mutually comparable (presumably the stored
        # value is a string -- confirm against the writer), so numbers are
        # grouped first and every other type is grouped by type name. Values of
        # the same type keep their natural ordering.
        if isinstance(value, (int, float)):
            return (0, '', value)
        return (1, type(value).__name__, value)

    def refresh_experiments(self):
        """Read experiments from the file system.

        Directories that the reader fails to load are reported on stderr and
        skipped. The resulting mapping is ordered by experiment start time.
        """
        loaded = []
        for exp in glob(self.experiments_directory + '/*/'):
            try:
                loaded.append(self.reader(exp))
            except Exception:
                # Best effort: one unreadable experiment must not hide the others.
                print("Unable to read {exp}".format(exp=exp), file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
        loaded.sort(key=self._start_time_sort_key)
        self.experiments = {exp_reader.expid: exp_reader for exp_reader in loaded}

    def as_dataframe(self):
        """Returns all experiment data as a pandas dataframe.

        The reader attributes are flattened into one row per experiment,
        indexed by ``expid``, with nested dictionary keys turned into
        multilevel columns.

        Raises:
            IndexError: if no experiments are loaded.
        """
        if not self.experiments:
            raise IndexError("Unable to load any experiments")

        # pandas >= 1.0 exposes json_normalize at the top level; fall back to
        # the legacy pandas.io.json name imported by this module otherwise.
        normalize = getattr(pd, 'json_normalize', None)
        if normalize is None:
            normalize = json_normalize

        records = [vars(exp_reader) for exp_reader in self.experiments.values()]
        df = normalize(records).set_index('expid')

        # Convert json_normalized columns into multilevel columns for ease of use and nicer printing.
        # Shallower columns are left-padded with '' so every column has the same number of levels.
        column_parts = [column.split('.') for column in df.columns]
        max_col_levels = max(len(parts) for parts in column_parts)
        df.columns = pd.MultiIndex.from_tuples(
            [tuple([''] * (max_col_levels - len(parts)) + parts) for parts in column_parts])
        return df

    def __getitem__(self, key):
        return self.experiments[key]