Commit 50ab52cb authored by Evan Shelhamer's avatar Evan Shelhamer
Browse files

Merge pull request #2350 from drdan14/log-parser-python-improved

Python log parser improvements
parents d3cc5803 23d28fde
......@@ -3,7 +3,7 @@
"""
Parse training log
Competitor to parse_log.sh
Evolved from parse_log.sh
"""
import os
......@@ -11,18 +11,7 @@ import re
import extract_seconds
import argparse
import csv
def get_line_type(line):
"""Return either 'test' or 'train' depending on line type
"""
line_type = None
if line.find('Train') != -1:
line_type = 'train'
elif line.find('Test') != -1:
line_type = 'test'
return line_type
from collections import OrderedDict
def parse_log(path_to_log):
......@@ -36,70 +25,112 @@ def parse_log(path_to_log):
for the two dict_lists
"""
re_iteration = re.compile('Iteration (\d+)')
re_accuracy = re.compile('output #\d+: accuracy = ([\.\d]+)')
re_train_loss = re.compile('Iteration \d+, loss = ([\.\d]+)')
re_output_loss = re.compile('output #\d+: loss = ([\.\d]+)')
re_lr = re.compile('lr = ([\.\d]+)')
regex_iteration = re.compile('Iteration (\d+)')
regex_train_output = re.compile('Train net output #(\d+): (\S+) = ([\.\deE+-]+)')
regex_test_output = re.compile('Test net output #(\d+): (\S+) = ([\.\deE+-]+)')
regex_learning_rate = re.compile('lr = ([\.\d]+)')
# Pick out lines of interest
iteration = -1
test_accuracy = -1
learning_rate = float('NaN')
train_dict_list = []
test_dict_list = []
train_dict_names = ('NumIters', 'Seconds', 'TrainingLoss', 'LearningRate')
test_dict_names = ('NumIters', 'Seconds', 'TestAccuracy', 'TestLoss')
train_row = None
test_row = None
logfile_year = extract_seconds.get_log_created_year(path_to_log)
with open(path_to_log) as f:
start_time = extract_seconds.get_start_time(f, logfile_year)
for line in f:
iteration_match = re_iteration.search(line)
iteration_match = regex_iteration.search(line)
if iteration_match:
iteration = float(iteration_match.group(1))
if iteration == -1:
# Only look for other stuff if we've found the first iteration
# Only start parsing for other stuff if we've found the first
# iteration
continue
time = extract_seconds.extract_datetime_from_line(line,
logfile_year)
seconds = (time - start_time).total_seconds()
lr_match = re_lr.search(line)
if lr_match:
learning_rate = float(lr_match.group(1))
accuracy_match = re_accuracy.search(line)
if accuracy_match and get_line_type(line) == 'test':
test_accuracy = float(accuracy_match.group(1))
train_loss_match = re_train_loss.search(line)
if train_loss_match:
train_loss = float(train_loss_match.group(1))
train_dict_list.append({'NumIters': iteration,
'Seconds': seconds,
'TrainingLoss': train_loss,
'LearningRate': learning_rate})
output_loss_match = re_output_loss.search(line)
if output_loss_match and get_line_type(line) == 'test':
test_loss = float(output_loss_match.group(1))
# NOTE: we assume that (1) accuracy always comes right before
# loss for test data so the test_accuracy variable is already
# correctly populated and (2) there's one and only one output
# named "accuracy" for the test net
test_dict_list.append({'NumIters': iteration,
'Seconds': seconds,
'TestAccuracy': test_accuracy,
'TestLoss': test_loss})
return train_dict_list, train_dict_names, test_dict_list, test_dict_names
def save_csv_files(logfile_path, output_dir, train_dict_list, train_dict_names,
test_dict_list, test_dict_names, verbose=False):
learning_rate_match = regex_learning_rate.search(line)
if learning_rate_match:
learning_rate = float(learning_rate_match.group(1))
train_dict_list, train_row = parse_line_for_net_output(
regex_train_output, train_row, train_dict_list,
line, iteration, seconds, learning_rate
)
test_dict_list, test_row = parse_line_for_net_output(
regex_test_output, test_row, test_dict_list,
line, iteration, seconds, learning_rate
)
fix_initial_nan_learning_rate(train_dict_list)
fix_initial_nan_learning_rate(test_dict_list)
return train_dict_list, test_dict_list
def parse_line_for_net_output(regex_obj, row, row_dict_list,
line, iteration, seconds, learning_rate):
"""Parse a single line for training or test output
Returns a a tuple with (row_dict_list, row)
row: may be either a new row or an augmented version of the current row
row_dict_list: may be either the current row_dict_list or an augmented
version of the current row_dict_list
"""
output_match = regex_obj.search(line)
if output_match:
if not row or row['NumIters'] != iteration:
# Push the last row and start a new one
if row:
# If we're on a new iteration, push the last row
# This will probably only happen for the first row; otherwise
# the full row checking logic below will push and clear full
# rows
row_dict_list.append(row)
row = OrderedDict([
('NumIters', iteration),
('Seconds', seconds),
('LearningRate', learning_rate)
])
# output_num is not used; may be used in the future
# output_num = output_match.group(1)
output_name = output_match.group(2)
output_val = output_match.group(3)
row[output_name] = float(output_val)
if row and len(row_dict_list) >= 1 and len(row) == len(row_dict_list[0]):
# The row is full, based on the fact that it has the same number of
# columns as the first row; append it to the list
row_dict_list.append(row)
row = None
return row_dict_list, row
def fix_initial_nan_learning_rate(dict_list):
"""Correct initial value of learning rate
Learning rate is normally not printed until after the initial test and
training step, which means the initial testing and training rows have
LearningRate = NaN. Fix this by copying over the LearningRate from the
second row, if it exists.
"""
if len(dict_list) > 1:
dict_list[0]['LearningRate'] = dict_list[1]['LearningRate']
def save_csv_files(logfile_path, output_dir, train_dict_list, test_dict_list,
delimiter=',', verbose=False):
"""Save CSV files to output_dir
If the input log file is, e.g., caffe.INFO, the names will be
......@@ -108,18 +139,22 @@ def save_csv_files(logfile_path, output_dir, train_dict_list, train_dict_names,
log_basename = os.path.basename(logfile_path)
train_filename = os.path.join(output_dir, log_basename + '.train')
write_csv(train_filename, train_dict_list, train_dict_names, verbose)
write_csv(train_filename, train_dict_list, delimiter, verbose)
test_filename = os.path.join(output_dir, log_basename + '.test')
write_csv(test_filename, test_dict_list, test_dict_names, verbose)
write_csv(test_filename, test_dict_list, delimiter, verbose)
def write_csv(output_filename, dict_list, header_names, verbose=False):
def write_csv(output_filename, dict_list, delimiter, verbose=False):
"""Write a CSV file
"""
dialect = csv.excel
dialect.delimiter = delimiter
with open(output_filename, 'w') as f:
dict_writer = csv.DictWriter(f, header_names)
dict_writer = csv.DictWriter(f, fieldnames=dict_list[0].keys(),
dialect=dialect)
dict_writer.writeheader()
dict_writer.writerows(dict_list)
if verbose:
......@@ -141,16 +176,20 @@ def parse_args():
action='store_true',
help='Print some extra info (e.g., output filenames)')
parser.add_argument('--delimiter',
default=',',
help=('Column delimiter in output files '
'(default: \'%(default)s\')'))
args = parser.parse_args()
return args
def main():
args = parse_args()
train_dict_list, train_dict_names, test_dict_list, test_dict_names = \
parse_log(args.logfile_path)
train_dict_list, test_dict_list = parse_log(args.logfile_path)
save_csv_files(args.logfile_path, args.output_dir, train_dict_list,
train_dict_names, test_dict_list, test_dict_names)
test_dict_list, delimiter=args.delimiter)
if __name__ == '__main__':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment