"""Utility script: selects instance attempts ("runs") for the main text figures/tables.
USAGE:
| python -m post_processing.select_runs \
| [--drop_larger_than N] \
| -i <input_summary_full.csv> \
| -o <out_summary.csv>
Takes a summary file generated by the code in
:py:mod:`post_processing.logparser`, and selects one run (solution attempt,
corresponding to a run log file) per instance as follows:
- If we have feasible solutions, then take the most recent
run that yielded one,
- if we have no feasible solutions at all, pick the most recent run
that yielded an infeasible solution,
- otherwise, pick the most recent run out of those we have ("fail" runs).
- If the optional argument is given, it drops all instances with more than ``N``
variables from selection. (Needed for the IBM simulator dataset, where we drop
all instances of > 32 variables, which are known to fail in advance due to the
simulator restriction.)
The code is independent of the log type (DWave, IBM, or QuEra)
"""
import argparse
import pandas as pd
def main():
    """Select one run per instance from a full summary CSV.

    Reads the CSV named by ``-i/--input_summary_file``, keeps at most one
    row (run) per ``instance_id`` -- preferring the most recent feasible
    run, then the most recent infeasible-but-successful run, then the most
    recent failed run -- and writes the result to
    ``-o/--output_summary_file``.  If ``-M/--drop_larger_than`` is
    positive, instances with more than that many QUBO variables are
    dropped before selection.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_summary_file', type=str)
    parser.add_argument('-o', '--output_summary_file', type=str)
    parser.add_argument('-M', '--drop_larger_than', type=int, default=-1)
    args = parser.parse_args()

    infile = args.input_summary_file
    outfile = args.output_summary_file
    M = args.drop_larger_than

    all_runs = pd.read_csv(infile)
    if M > 0:
        # Drop all instances with QUBO size of more than M variables.
        # Needed to remove failed runs for the IBM simulator,
        # where we had > 32 variables.
        all_runs = all_runs.loc[all_runs.qubo_vars <= M, ]

    # Partition runs into three disjoint groups: feasible (has an
    # objective value), infeasible-but-successful, and failed.
    feas_runs = all_runs.loc[all_runs['obj_from_QPU_sol'].notnull(), ]
    infeas_runs = all_runs.loc[(all_runs['success'].astype(str) == "True") &
                               (all_runs['obj_from_QPU_sol'].isnull()), ]
    failed_runs = all_runs.loc[all_runs['success'].astype(str) != "True"]

    # Check that we did not lose any runs in the partition above.
    assert len(all_runs.instance_id) == len(feas_runs) + \
        len(infeas_runs) + \
        len(failed_runs), f"Error parsing {infile}, not all runs are covered."

    # Keep infeasible runs only for instances with no feasible run,
    # and failed runs only for instances with neither.
    feas_ids = set(feas_runs.instance_id)
    infeas_runs = infeas_runs.loc[~(infeas_runs.instance_id.isin(feas_ids))]
    infeas_ids = set(infeas_runs.instance_id)
    success_ids = feas_ids.union(infeas_ids)
    failed_runs = failed_runs.loc[~(failed_runs.instance_id.isin(success_ids))]

    # Within each group, pick the most recent run per instance.
    feas_idx = feas_runs.groupby('instance_id')['start_timestamp'].idxmax()
    infeas_idx = infeas_runs.groupby('instance_id')['start_timestamp'].idxmax()
    failed_idx = failed_runs.groupby('instance_id')['start_timestamp'].idxmax()
    feas_runs = feas_runs.loc[feas_idx, ]
    infeas_runs = infeas_runs.loc[infeas_idx, ]
    failed_runs = failed_runs.loc[failed_idx, ]
    assert len(set(all_runs.instance_id)) == len(feas_runs) + \
        len(infeas_runs) + \
        len(failed_runs)

    latest_runs = pd.concat([feas_runs, infeas_runs, failed_runs],
                            ignore_index=True)

    # A few cross-checks.
    # BUGFIX: the message previously read `latest_runs.insatnce_id`
    # (typo), which would raise AttributeError on the failure path.
    assert set(latest_runs.instance_id) == set(all_runs.instance_id), \
        f"Error parsing {infile}: not all instances are covered. Lost are: {set(latest_runs.instance_id) ^ set(all_runs.instance_id)}"
    assert len(set(latest_runs.instance_id)) == len(latest_runs.instance_id), \
        f"Error parsing {infile}: duplicate instances in latest_runs."
    # BUGFIX: this check was missing its `assert` keyword (it was a no-op
    # tuple expression); restored so the total-count invariant is enforced.
    assert len(latest_runs.instance_id) == len(feas_runs) + \
        len(infeas_runs) + \
        len(failed_runs), f"Error parsing {infile}, not all instances covered."
    assert len(feas_runs.instance_id) == len(set(feas_runs.instance_id)), \
        "Error: duplicates in feas_runs"
    assert len(infeas_runs.instance_id) == len(set(infeas_runs.instance_id)), \
        "Error: duplicates in infeas_runs"
    assert len(failed_runs.instance_id) == len(set(failed_runs.instance_id)), \
        "Error: duplicates in failed_runs"

    latest_runs.to_csv(outfile, index=False)
    print(f"{all_runs.shape[0] - latest_runs.shape[0]} runs removed.")
    print(f"{outfile} covers {latest_runs.shape[0]} runs out of {all_runs.shape[0]}")
# Standard entry-point guard: run the selection script only when this
# module is executed directly (not when imported).
if __name__ == '__main__':
    main()