Performance degradation compared to the previous psds eval #4

Open
VEOjiwon opened this issue Mar 29, 2023 · 3 comments
@VEOjiwon

Hello.

Problem:
With this library, the performance of certain models is lower than with the previous psds_eval-based PSDS score. Please refer to the screenshot below for the numbers. The evaluation code follows, and the scores used to reproduce these results are shared through the link below.

Code used for evaluation:

from utils.evaluation_measures import (compute_per_intersection_macro_f1,
                                       compute_psds_from_operating_points,
                                       compute_psds_from_scores)
import sed_scores_eval


def test(train_cfg, configs, args, logger, eval=False, carbon_iter=False):
    if configs["generals"]["carbon"] and eval and carbon_iter:
        os.makedirs(os.path.join(configs["generals"]["save_folder"], "eval_codecarbon"), exist_ok=True)
        tracker_eval = EmissionsTracker("DCASE Task 4 SED EVALUATE",
                                    output_dir=os.path.join(configs["generals"]["save_folder"],
                                    "evaluation_codecarbon"))
        tracker_eval.start()
    if configs["generals"]["carbon"] and not eval and carbon_iter:
        os.makedirs(os.path.join(configs["generals"]["save_folder"], "devtest_codecarbon"), exist_ok=True)
        tracker_devtest = EmissionsTracker("DCASE Task 4 SED EVALUATE",
                                        output_dir=os.path.join(configs["generals"]["save_folder"],
                                        "devtest_codecarbon"))
        tracker_devtest.start()

    encoder = train_cfg["encoder"]
    psds_folders = train_cfg["psds_folders"]
    thresholds = np.arange(1 / (train_cfg["n_test_thresholds"] * 2), 1, 1 / train_cfg["n_test_thresholds"])
    train_cfg["net"].eval()
    train_cfg["ema_net"].eval()
    test_tsv, test_dur = train_cfg["test_tsvs"]
    test_scores_raw_buffer_student = {}
    test_scores_raw_buffer_teacher = {}
    test_scores_postprocessed_buffer_student = {}
    test_scores_postprocessed_buffer_teacher = {}
    
    with torch.no_grad():
        stud_test_psds_buffer = {k: pd.DataFrame() for k in thresholds}
        tch_test_psds_buffer = {k: pd.DataFrame() for k in thresholds}
        stud_test_psds_2023_buffer = {k: pd.DataFrame() for k in thresholds}
        tch_test_psds_2023_buffer = {k: pd.DataFrame() for k in thresholds}
        stud_test_f1_buffer = pd.DataFrame()
        tch_test_f1_buffer = pd.DataFrame()
        tk2 = tqdm(train_cfg["testloader"], total=len(train_cfg["testloader"]), leave=False, desc="test processing")
        for _, (wavs, labels, _, indexes, filenames, paths) in enumerate(tk2, 0):
            wavs, labels = wavs.to(train_cfg["device"]), labels.to(train_cfg["device"]) # labels size = [bs, n_class, frames]
            mels = train_cfg["feat_ext"](wavs)  # features size = [bs, freqs, frames]
            logmels = train_cfg["scaler"](take_log(mels))

            stud_preds, weak_stud_preds = train_cfg["net"](logmels)
            tch_preds, weak_tch_preds = train_cfg["ema_net"](logmels)

            stud_pred_dfs = decode_pred_batch(stud_preds, weak_stud_preds, paths, encoder,
                                              list(stud_test_psds_buffer.keys()), train_cfg["median_window"],
                                              train_cfg["decode_weak_test"])
            tch_pred_dfs = decode_pred_batch(tch_preds, weak_tch_preds, paths, encoder,
                                             list(tch_test_psds_buffer.keys()), train_cfg["median_window"],
                                             train_cfg["decode_weak_test"])
            for th in stud_test_psds_buffer.keys():
                stud_test_psds_buffer[th] = stud_test_psds_buffer[th].append(stud_pred_dfs[th], ignore_index=True)
            for th in tch_test_psds_buffer.keys():
                tch_test_psds_buffer[th] = tch_test_psds_buffer[th].append(tch_pred_dfs[th], ignore_index=True)
            stud_pred_df_halfpoint = decode_pred_batch(stud_preds, weak_stud_preds, paths, encoder, [0.5],
                                                       train_cfg["median_window"], train_cfg["decode_weak_test"])
            tch_pred_df_halfpoint = decode_pred_batch(tch_preds, weak_tch_preds, paths, encoder, [0.5],
                                                      train_cfg["median_window"], train_cfg["decode_weak_test"])
            stud_test_f1_buffer = stud_test_f1_buffer.append(stud_pred_df_halfpoint[0.5], ignore_index=True)
            tch_test_f1_buffer = tch_test_f1_buffer.append(tch_pred_df_halfpoint[0.5], ignore_index=True)
            
            (scores_raw_student_strong, scores_postprocessed_student_strong, _
            ) = batched_decode_preds(stud_preds, weak_stud_preds, paths, encoder, thresholds=list(stud_test_psds_2023_buffer.keys()) + [.5], 
                                     median_filter=train_cfg["median_window"], decode_weak = train_cfg["decode_weak_test"])
            
            test_scores_raw_buffer_student.update(scores_raw_student_strong)
            test_scores_postprocessed_buffer_student.update(scores_postprocessed_student_strong)

            # for th in stud_test_psds_2023_buffer.keys():
            #     stud_test_psds_2023_buffer[th] = pd.concat([stud_test_psds_2023_buffer[th], decoded_student_strong[th]], ignore_index=True)

            (scores_raw_teacher_strong, scores_postprocessed_teacher_strong, _
            ) = batched_decode_preds(tch_preds, weak_tch_preds, paths, encoder, thresholds=list(tch_test_psds_2023_buffer.keys()) + [.5], 
                                     median_filter=train_cfg["median_window"], decode_weak = train_cfg["decode_weak_test"])
            
            test_scores_raw_buffer_teacher.update(scores_raw_teacher_strong)
            test_scores_postprocessed_buffer_teacher.update(scores_postprocessed_teacher_strong)

    if configs["generals"]["carbon"] and eval and carbon_iter:
        tracker_eval.stop()
        eval_kwh = tracker_eval._total_energy.kwh
        logger.log_metrics({"/evaluate/tot_energy_kWh": torch.tensor(float(eval_kwh))})
        with open(os.path.join(configs["generals"]["save_folder"], "evaluation_codecarbon", "eval_tot_kwh.txt"), "w") as f:
            f.write(str(eval_kwh))

    ground_truth = sed_scores_eval.io.read_ground_truth_events(test_tsv)
    audio_durations = sed_scores_eval.io.read_audio_durations(test_dur)
    ground_truth = {
        audio_id: gt for audio_id, gt in ground_truth.items()
        if len(gt) > 0
    }
    audio_durations = {
        audio_id: audio_durations[audio_id]
        for audio_id in ground_truth.keys()
    }
    
    # calculate psds
    psds1_kwargs = {"dtc_threshold": 0.7, "gtc_threshold": 0.7, "alpha_ct": 0, "alpha_st": 1}
    psds2_kwargs = {"dtc_threshold": 0.1, "gtc_threshold": 0.1, "cttc_threshold": 0.3, "alpha_ct": 0.5, "alpha_st": 1}
    stud_psds1 = compute_psds_from_operating_points(stud_test_psds_buffer, test_tsv, test_dur, save_dir=psds_folders[0],
                                                    **psds1_kwargs)
    stud_psds2 = compute_psds_from_operating_points(stud_test_psds_buffer, test_tsv, test_dur, save_dir=psds_folders[0],
                                                    **psds2_kwargs)
    tch_psds1 = compute_psds_from_operating_points(tch_test_psds_buffer, test_tsv, test_dur, save_dir=psds_folders[1],
                                                   **psds1_kwargs)
    tch_psds2 = compute_psds_from_operating_points(tch_test_psds_buffer, test_tsv, test_dur, save_dir=psds_folders[1],
                                                   **psds2_kwargs)
    
    
    stud_psds1_sed_scores_eval = compute_psds_from_scores(test_scores_postprocessed_buffer_student, ground_truth, audio_durations, dtc_threshold=0.7, 
                                                          gtc_threshold=0.7, cttc_threshold=None, alpha_ct=0, alpha_st=1, save_dir=psds_folders[0])
    stud_psds2_sed_scores_eval = compute_psds_from_scores(test_scores_postprocessed_buffer_student, ground_truth, audio_durations, save_dir=psds_folders[0],
                                                          **psds2_kwargs)
    tch_psds1_sed_scores_eval = compute_psds_from_scores(test_scores_postprocessed_buffer_teacher, ground_truth, audio_durations, dtc_threshold=0.7, 
                                                          gtc_threshold=0.7, cttc_threshold=None, alpha_ct=0, alpha_st=1, save_dir=psds_folders[0])
    tch_psds2_sed_scores_eval = compute_psds_from_scores(test_scores_postprocessed_buffer_teacher, ground_truth, audio_durations, save_dir=psds_folders[1],
                                                        **psds2_kwargs)
    
    s_evt_ma_f1, s_evt_mi_f1, s_seg_ma_f1, s_seg_mi_f1 = log_sedeval_metrics(stud_test_f1_buffer,
                                                                             test_tsv, psds_folders[0])
    s_inter_f1 = compute_per_intersection_macro_f1({"0.5": stud_test_f1_buffer}, test_tsv, test_dur)
    t_evt_ma_f1, t_evt_mi_f1, t_seg_ma_f1, t_seg_mi_f1 = log_sedeval_metrics(tch_test_f1_buffer,
                                                                             test_tsv, psds_folders[1])
    t_inter_f1 = compute_per_intersection_macro_f1({"0.5": tch_test_f1_buffer}, test_tsv, test_dur)
    
    if configs["generals"]["carbon"] and not eval and carbon_iter:
        tracker_devtest.stop()
        eval_kwh = tracker_devtest._total_energy.kwh
        logger.log_metrics({"/evaluate/tot_energy_kWh": torch.tensor(float(eval_kwh))})
        with open(os.path.join(configs["generals"]["save_folder"], "evaluation_codecarbon", "eval_tot_kwh.txt"), "w") as f:
            f.write(str(eval_kwh))

[Screenshot 2023-03-29, 1:35 PM: reported PSDS results]

scores : https://drive.google.com/file/d/1i_OPaFSQKnZZH6kQZr5cqxTdNsapn0mh/view?usp=share_link

@JanekEbb
Collaborator

JanekEbb commented Mar 30, 2023

Hi,

I have tested sed_scores_eval on the teacher scores you provided as follows

scores_dir = Path("/path/to/your/provided/scores_dir")
psds, psd_roc, single_class_psd_rocs = intersection_based.psds(
    scores=scores_dir/"psds_teacher"/"scores",
    ground_truth=scores_dir/"public.tsv",
    audio_durations=scores_dir/"public_duration.tsv",
    dtc_threshold=.7,
    gtc_threshold=.7,
    cttc_threshold=None,
    alpha_ct=0,
    alpha_st=1.,
    unit_of_time='hour',
    max_efpr=100.,
    num_jobs=3,
    time_decimals=6,
)
print(psds)

which gives a psds of 0.5054, as in your screenshot.

Running the reference implementation based on psds_eval (which is used for testing) as follows

(
    psds_ref, psd_roc_ref, single_class_psd_rocs_ref
) = intersection_based.reference.approximate_psds(
    scores=scores_dir/"psds_teacher"/"scores",
    ground_truth=scores_dir/"public.tsv",
    audio_durations=scores_dir/"public_duration.tsv",
    thresholds=np.linspace(0.01,.99,50),
    dtc_threshold=.7,
    gtc_threshold=.7,
    cttc_threshold=None,
    alpha_ct=0,
    alpha_st=1.,
    unit_of_time='hour',
    max_efpr=100.,
)
print(psds_ref)

gives a psds_ref of 0.4956, which doesn't match your reported value.

I am wondering if the reason for the difference might lie in the way you perform the decoding before computing the psds from operating points. In your code (which I cannot execute, by the way) I see that you use decode_pred_batch to obtain the operating points for compute_psds_from_operating_points, whereas the scores for the compute_psds_from_scores call come from batched_decode_preds.

Could you maybe provide a function (including its dependencies, so I can execute it) where both of your reported results are generated from the same scores that you provided?
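
For illustration, such a minimal example could look roughly like the sketch below. It is untested, and testloader, net, feat_ext, scaler, take_log, device, encoder, median_window, decode_weak_test, test_tsv, test_dur, ground_truth and audio_durations are placeholders for the objects in your snippet; the key point is that both numbers are computed from the output of the same batched_decode_preds call.

import numpy as np
import pandas as pd
import torch

# Untested sketch: fill BOTH buffers from the SAME batched_decode_preds call, so the only
# remaining difference is psds_eval on operating points vs sed_scores_eval on scores.
n_thresholds = 50
thresholds = np.arange(1 / (n_thresholds * 2), 1, 1 / n_thresholds)
psds_buffer = {th: pd.DataFrame() for th in thresholds}   # operating points per threshold
scores_postprocessed = {}                                  # input for compute_psds_from_scores

with torch.no_grad():
    for wavs, labels, _, indexes, filenames, paths in testloader:   # placeholder loader
        logmels = scaler(take_log(feat_ext(wavs.to(device))))       # as in your snippet
        strong_preds, weak_preds = net(logmels)
        _, scores_pp, pred_dfs = batched_decode_preds(
            strong_preds, weak_preds, paths, encoder,
            thresholds=list(thresholds),
            median_filter=median_window, decode_weak=decode_weak_test,
        )
        scores_postprocessed.update(scores_pp)
        for th in thresholds:
            psds_buffer[th] = pd.concat([psds_buffer[th], pred_dfs[th]], ignore_index=True)

psds1_kwargs = {"dtc_threshold": 0.7, "gtc_threshold": 0.7, "alpha_ct": 0, "alpha_st": 1}
psds1_from_ops = compute_psds_from_operating_points(psds_buffer, test_tsv, test_dur, **psds1_kwargs)
psds1_from_scores = compute_psds_from_scores(
    scores_postprocessed, ground_truth, audio_durations, cttc_threshold=None, **psds1_kwargs
)
print(psds1_from_ops, psds1_from_scores)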

@VEOjiwon
Author

VEOjiwon commented Apr 6, 2023

I use the same code as the DCASE baseline and the FDY-SED GitHub repository (https://github.com/frednam93/FDY-SED).

These are the functions that were used:

def batched_decode_preds(strong_preds, weak_preds, filenames, encoder, thresholds, median_filter, decode_weak, pad_indx=None):
    # Init a dataframe per threshold
    scores_raw = {}
    scores_postprocessed = {}
    prediction_dfs = {}
    for threshold in thresholds:
        prediction_dfs[threshold] = pd.DataFrame()

    for j in range(strong_preds.shape[0]):  # over batches
        audio_id = Path(filenames[j]).stem
        filename = audio_id + ".wav"
        c_scores = strong_preds[j]
        # pdb.set_trace()
        if pad_indx is not None:
            true_len = int(c_scores.shape[-1] * pad_indx[j].item())
            c_scores = c_scores[:true_len]
        
        
        c_scores = c_scores.transpose(0, 1).detach().cpu().numpy()
        
        c_scores_org = copy.deepcopy(c_scores)
        scores_raw[audio_id] = create_score_dataframe(
            scores=c_scores,
            timestamps=encoder._frame_to_time(np.arange(len(c_scores)+1)),
            event_classes=encoder.labels,
        )
        for mf_idx in range(len(median_filter)):
            c_scores[:, mf_idx] = scipy.ndimage.filters.median_filter(c_scores[:, mf_idx], (median_filter[mf_idx]))
        scores_postprocessed[audio_id] = create_score_dataframe(
            scores=c_scores,
            timestamps=encoder._frame_to_time(np.arange(len(c_scores)+1)),
            event_classes=encoder.labels,
        )
        for c_th in thresholds:
            if decode_weak: # if decode_weak = 1 or 2
                for class_idx in range(weak_preds.size(1)):
                    if weak_preds[j, class_idx] < c_th:
                        c_scores_org[:, class_idx] = 0
                    elif decode_weak > 1: # use only weak predictions (weakSED)
                        c_scores_org[:, class_idx] = 1
            if decode_weak < 2: # weak prediction masking
                c_scores_org = c_scores_org > c_th
                for mf_idx in range(len(median_filter)):
                    c_scores_org[:, mf_idx] = scipy.ndimage.filters.median_filter(c_scores_org[:, mf_idx], (median_filter[mf_idx]))
            pred = encoder.decode_strong(c_scores_org)
            pred = pd.DataFrame(pred, columns=["event_label", "onset", "offset"])
            pred["filename"] = filename
            prediction_dfs[c_th] = pd.concat([prediction_dfs[c_th], pred], ignore_index=True)

    return scores_raw, scores_postprocessed, prediction_dfs

def decode_pred_batch(outputs, weak_preds, filenames, encoder, thresholds, median_filter, decode_weak, pad_idx=None):
    pred_dfs = {}
    for threshold in thresholds:
        pred_dfs[threshold] = pd.DataFrame()
    for batch_idx in range(outputs.shape[0]): #outputs size = [bs, n_class, frames]
        for c_th in thresholds:
            output = outputs[batch_idx]       #outputs size = [n_class, frames]
            if pad_idx is not None:
                true_len = int(output.shape[-1] * pad_idx[batch_idx].item())
                output = output[:true_len]
            output = output.transpose(0, 1).detach().cpu().numpy() #output size = [frames, n_class]
            if decode_weak: # if decode_weak = 1 or 2
                for class_idx in range(weak_preds.size(1)):
                    if weak_preds[batch_idx, class_idx] < c_th:
                        output[:, class_idx] = 0
                    elif decode_weak > 1: # use only weak predictions (weakSED)
                        output[:, class_idx] = 1
            if decode_weak < 2: # weak prediction masking
                output = output > c_th
                for mf_idx in range(len(median_filter)):
                    output[:, mf_idx] = scipy.ndimage.filters.median_filter(output[:, mf_idx], (median_filter[mf_idx]))
            pred = encoder.decode_strong(output)
            pred = pd.DataFrame(pred, columns=["event_label", "onset", "offset"])
            pred["filename"] = Path(filenames[batch_idx]).stem + ".wav"
            pred_dfs[c_th] = pred_dfs[c_th].append(pred, ignore_index=True)
    return pred_dfs

@JanekEbb
Collaborator

JanekEbb commented Apr 6, 2023

Hi,

So if you use different methods for post-processing, you cannot expect the results to be comparable. If you think, however, that the two functions are doing the same post-processing, then please provide a minimal working example where scores with the same post-processing give a psds1_2023 that is lower than psds1_2022. Otherwise I cannot tell whether there really is an issue here.
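
As a rough starting point, such a check could look like the sketch below (untested; strong_preds, weak_preds, paths, encoder, median_window and decode_weak_test are placeholders for one batch and the config from your snippets). It runs both decoding functions on identical scores and compares the event predictions at a single threshold:

import pandas as pd

# Untested sketch: feed the SAME predictions through both decoders and compare
# the event tables they produce at threshold 0.5.
dfs_old = decode_pred_batch(
    strong_preds, weak_preds, paths, encoder,
    thresholds=[0.5], median_filter=median_window, decode_weak=decode_weak_test,
)
_, _, dfs_new = batched_decode_preds(
    strong_preds, weak_preds, paths, encoder,
    thresholds=[0.5], median_filter=median_window, decode_weak=decode_weak_test,
)

sort_cols = ["filename", "event_label", "onset", "offset"]
a = dfs_old[0.5].sort_values(sort_cols).reset_index(drop=True)
b = dfs_new[0.5].sort_values(sort_cols).reset_index(drop=True)
print("same post-processing at threshold 0.5:", a.equals(b))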
