"""Utility script: selects instance attempts ("runs") for the main text figures/tables.
USAGE:
| python -m post_processing.select_runs \
| [--drop_larger_than N] \
| -i <input_summary_full.csv> \
| -o <out_summary.csv>
Takes a summary file generated by the code in
:py:mod:`post_processing.logparser`, and selects one run (solution attempt,
corresponding to a run log file) per instance as follows:
- If we have feasible solutions, then take the most recent
run that yielded one,
- if we have no feasible solutions at all, pick the most recent run
that yielded an infeasible solution,
- otherwise, pick the most recent run out of those we have ("fail" runs).
- If the optional argument is given, it drops all instances with more than ``N``
variables from selection. (Needed for the IBM simulator dataset, where we drop
all instances of > 32 variables, which are known to fail in advance due to the
simulator restriction.)
The code is independent of the log type (DWave, IBM, or QuEra)
"""
import argparse
import pandas as pd
def main():
    """Select one run per instance from a full summary CSV.

    Reads the CSV named by ``-i/--input_summary_file``, keeps at most one
    row (run) per ``instance_id`` -- preferring the most recent feasible
    run, then the most recent infeasible-but-successful run, then the most
    recent failed run -- and writes the result to
    ``-o/--output_summary_file``.  If ``-M/--drop_larger_than`` is
    positive, instances with more than that many QUBO variables are
    dropped before selection.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_summary_file', type=str)
    parser.add_argument('-o', '--output_summary_file', type=str)
    parser.add_argument('-M', '--drop_larger_than', type=int, default=-1)
    args = parser.parse_args()

    infile = args.input_summary_file
    outfile = args.output_summary_file
    M = args.drop_larger_than

    all_runs = pd.read_csv(infile)
    if M > 0:
        # Drop all instances with QUBO size of more than M variables.
        # Needed to remove failed runs for the IBM simulator,
        # where we had > 32 variables.
        all_runs = all_runs.loc[all_runs.qubo_vars <= M, ]

    # Partition runs into three disjoint groups: feasible (has an
    # objective value), infeasible-but-successful, and failed.
    feas_runs = all_runs.loc[all_runs['obj_from_QPU_sol'].notnull(), ]
    infeas_runs = all_runs.loc[(all_runs['success'].astype(str) == "True") &
                               (all_runs['obj_from_QPU_sol'].isnull()), ]
    failed_runs = all_runs.loc[all_runs['success'].astype(str) != "True"]

    # Check that we did not lose any runs in the partition above.
    assert len(all_runs.instance_id) == len(feas_runs) + \
        len(infeas_runs) + \
        len(failed_runs), f"Error parsing {infile}, not all runs are covered."

    # Keep infeasible runs only for instances with no feasible run,
    # and failed runs only for instances with neither.
    feas_ids = set(feas_runs.instance_id)
    infeas_runs = infeas_runs.loc[~(infeas_runs.instance_id.isin(feas_ids))]
    infeas_ids = set(infeas_runs.instance_id)
    success_ids = feas_ids.union(infeas_ids)
    failed_runs = failed_runs.loc[~(failed_runs.instance_id.isin(success_ids))]

    # Within each group, pick the most recent run per instance.
    feas_idx = feas_runs.groupby('instance_id')['start_timestamp'].idxmax()
    infeas_idx = infeas_runs.groupby('instance_id')['start_timestamp'].idxmax()
    failed_idx = failed_runs.groupby('instance_id')['start_timestamp'].idxmax()
    feas_runs = feas_runs.loc[feas_idx, ]
    infeas_runs = infeas_runs.loc[infeas_idx, ]
    failed_runs = failed_runs.loc[failed_idx, ]
    assert len(set(all_runs.instance_id)) == len(feas_runs) + \
        len(infeas_runs) + \
        len(failed_runs)

    latest_runs = pd.concat([feas_runs, infeas_runs, failed_runs],
                            ignore_index=True)

    # A few cross-checks.
    # BUGFIX: the message previously read `latest_runs.insatnce_id`
    # (typo), which would raise AttributeError on the failure path.
    assert set(latest_runs.instance_id) == set(all_runs.instance_id), \
        f"Error parsing {infile}: not all instances are covered. Lost are: {set(latest_runs.instance_id) ^ set(all_runs.instance_id)}"
    assert len(set(latest_runs.instance_id)) == len(latest_runs.instance_id), \
        f"Error parsing {infile}: duplicate instances in latest_runs."
    # BUGFIX: this check was missing its `assert` keyword (it was a no-op
    # tuple expression); restored so the total-count invariant is enforced.
    assert len(latest_runs.instance_id) == len(feas_runs) + \
        len(infeas_runs) + \
        len(failed_runs), f"Error parsing {infile}, not all instances covered."
    assert len(feas_runs.instance_id) == len(set(feas_runs.instance_id)), \
        "Error: duplicates in feas_runs"
    assert len(infeas_runs.instance_id) == len(set(infeas_runs.instance_id)), \
        "Error: duplicates in infeas_runs"
    assert len(failed_runs.instance_id) == len(set(failed_runs.instance_id)), \
        "Error: duplicates in failed_runs"

    latest_runs.to_csv(outfile, index=False)
    print(f"{all_runs.shape[0] - latest_runs.shape[0]} runs removed.")
    print(f"{outfile} covers {latest_runs.shape[0]} runs out of {all_runs.shape[0]}")
# Standard entry-point guard: run the selection script only when this
# module is executed directly (not when imported).
if __name__ == '__main__':
    main()