
Commit

more comments
gareth authored and gareth committed May 18, 2021
1 parent ffa97fc commit c78da89
Showing 2 changed files with 22 additions and 10 deletions.
15 changes: 12 additions & 3 deletions ptha_access/get_PTHA_results.R
@@ -1162,7 +1162,8 @@ estimate_exrate_uncertainty<-function(random_scenarios, event_peak_stage, thresh

local_fun<-function(mw){
.get_mean_and_variance_of_exrate_in_mw_bin_from_random_scenarios(mw, random_scenarios,
-event_peak_stage, threshold_stage, importance_sampling_type)}
+event_peak_stage, threshold_stage, importance_sampling_type)
+}

all_results = lapply(unique_Mw, function(x) local_fun(x))

@@ -1216,7 +1217,7 @@ estimate_exrate_uncertainty<-function(random_scenarios, event_peak_stage, thresh
#' conditional probability of sampling each scenario (within its magnitude-bin)
#' is proportional to event_rates. If provided then this gives the conditional
#' probability, and we compute the variance assuming that basic importance
-#' sampling is used to re-weight the scenarios.
+#' sampling is used to re-weight the scenarios, using the asymptotic variance formula.
#' @param TOL A numerical tolerance used for checking if magnitude-bins are
#' equally spaced. It should be much smaller than the magnitude-bin size (0.1
#' in PTHA18), but allow for minor floating-point variations in event_Mw.
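As a sketch of the formula being referenced (notation mine, inferred from the R code in the next hunk rather than quoted from the package documentation): for a magnitude bin $b$ with scenario rates $r_i$, bin rate $R_b = \sum_i r_i$, within-bin sampling probabilities $p_i$ (built from event_importance_weighted_sampling_probs), exceedance indicators $I_i$, and basic-importance-sampling weights $w_i = (r_i / R_b) / p_i$, the estimator from $n_b$ samples and its variance are

\[
\hat{\lambda}_b = \frac{R_b}{n_b} \sum_{k=1}^{n_b} I_{i_k}\, w_{i_k},
\qquad
\mathrm{Var}\big(\hat{\lambda}_b\big) = \frac{1}{n_b} \sum_i p_i \big( R_b\, I_i\, w_i - \lambda_b \big)^2,
\]

where $\lambda_b = \sum_i r_i I_i$ is the exact exceedance rate in the bin. The sum without the $1/n_b$ factor is the variance_numerator computed below, and choosing $p_i = r_i / R_b$ gives $w_i = 1$, recovering the standard stratified-sampling variance (which is why no separate implementation is needed for that case).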
@@ -1252,11 +1253,15 @@ get_optimal_number_of_samples_per_Mw<-function(event_Mw, event_rates,

k = which(abs(event_Mw - unique_event_Mw[i]) < TOL)

-if(sum(event_rates[k]) == 0){
+if(sum(event_rates[k]) == 0 | sum(event_importance_weighted_sampling_probs[k]) == 0){

variance_numerator[i] = 0

}else{
+# 'Basic-importance-sampling' variance. For the case with:
+# event_importance_weighted_sampling_probs = event_rates
+# this gives the same result as regular stratified sampling, so
+# we do not add a separate implementation.

# Bin-specific weights with standard stratified sampling
scenario_wts_no_importance = event_rates[k] / sum(event_rates[k])
@@ -1278,6 +1283,10 @@
variance_numerator[i] = sum( scenario_wts_importance *
(rate_of_Mw * (event_peak_stage[k] > stage_threshold) * basic_importance_sampling_weights -
true_exceedance_rate)**2 )
+#variance_numerator[i] = sum( scenario_wts_importance *
+# (rate_of_Mw * (event_peak_stage[k] > stage_threshold) * basic_importance_sampling_weights)^2) -
+# true_exceedance_rate^2

}
}
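To make the calculation above concrete, here is a small self-contained R sketch (toy numbers and shortened names of my own; not code from the repository) reproducing the variance_numerator computation for a single magnitude bin:

# Toy illustration of the per-bin variance numerator computed above.
event_rates = c(0.002, 0.001, 0.0005, 0.0001)     # hypothetical scenario rates
event_peak_stage = c(0.5, 1.2, 2.5, 4.0)          # hypothetical peak stages (m)
stage_threshold = 1.0
sampling_probs = event_peak_stage * event_rates   # stage-weighted importance sampling

rate_of_Mw = sum(event_rates)                     # total rate in the bin
wts_no_imp = event_rates / sum(event_rates)       # stratified sampling probabilities
wts_imp = sampling_probs / sum(sampling_probs)    # importance sampling probabilities
bis_weights = wts_no_imp / wts_imp                # basic importance sampling weights
exceeds = (event_peak_stage > stage_threshold)
true_exrate = rate_of_Mw * sum(wts_no_imp * exceeds)

variance_numerator = sum(wts_imp *
    (rate_of_Mw * exceeds * bis_weights - true_exrate)^2)
# The Monte-Carlo variance with n samples in this bin is variance_numerator / n.
print(variance_numerator)

Setting sampling_probs equal to event_rates makes bis_weights identically 1, and the same expression collapses to the regular stratified-sampling variance, as the comment in the diff notes.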

17 changes: 10 additions & 7 deletions ptha_access/test_all.R
@@ -421,10 +421,10 @@ test_random_scenario_sampling<-function(){
#
# Test our estimates of the optimal number of scenarios to sample in each Mw bin
#

+nrand_samples = sum(!is.na(random_scenarios_repeated$inds))
# Simple check of variance_numerator by back-calculation
t0 = ptha18$get_optimal_number_of_samples_per_Mw(event_Mw, event_rates, event_peak_stage,
-stage_threshold=threshold_stage, total_samples=nrow(random_scenarios_repeated))
+stage_threshold=threshold_stage, total_samples=nrand_samples)
rate_with_this_Mw = aggregate(event_rates, by=list(event_Mw), sum)$x
rate_exceeding_with_this_Mw = aggregate(event_rates * (event_peak_stage > threshold_stage),
by=list(event_Mw), sum)$x
@@ -442,22 +442,25 @@ test_random_scenario_sampling<-function(){
# Check that the theoretical variance (assuming we sample per Mw-bin like
# in random_scenarios_repeated) is close to the empirical variances (from
# the above repeated random sampling). This exercises importance-sampling as well
-# as unequal magnitude stratification.
+# as unequal magnitude stratification. Note that for importance-sampling the variance
+# is asymptotic (so it is not exact in finite samples)
#
t0 = ptha18$get_optimal_number_of_samples_per_Mw(event_Mw, event_rates, event_peak_stage,
-stage_threshold=threshold_stage, total_samples=nrow(random_scenarios_repeated),
+stage_threshold=threshold_stage, total_samples=nrand_samples,
event_importance_weighted_sampling_probs=(event_peak_stage*event_rates))
-repeated_random_scenario_counts = as.numeric(table(random_scenarios_repeated$mw))
-expected_variance = sum(t0$variance_numerator/repeated_random_scenario_counts)
+repeated_random_scenario_counts = aggregate(!is.na(random_scenarios_repeated$inds),
+by=list(random_scenarios_repeated$mw), sum)$x
+expected_variance = sum(t0$variance_numerator/repeated_random_scenario_counts, na.rm=TRUE)
empirical_variance = var(est_sd_store[['basic']][,1])
-# Check they agree within a few percent
+# Check they agree within a few percent (the estimate is not exact in finite-samples)
if(abs(empirical_variance - expected_variance) < 0.03*expected_variance){
print('PASS')
}else{
print('FAIL -- the theoretical variance is not close enough to the sampling variance')
}

# Check that the typical 'sample-sd-estimate' is reasonably close to the expected value
+# (again note the estimate is not exact in finite samples)
typical_sd_estimate = median(est_sd_store[['basic']][,2])
expected_sd = sqrt(expected_variance)
if( abs(typical_sd_estimate - expected_sd) < 0.05*expected_sd ){
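For context on the counting change above, a toy R example (mine, not from the repository; the NA semantics are inferred from the diff) showing why the per-bin sample counts moved from table() to an aggregate() over non-NA indices:

# Toy illustration: some repeated draws may have no valid scenario (NA index).
random_scenarios_repeated = data.frame(
    mw   = c(7.2, 7.2, 7.3, 7.3, 7.3),
    inds = c(10,  NA,  4,   7,   NA))

as.numeric(table(random_scenarios_repeated$mw))
# 2 3  -- counts all rows in each Mw bin, including NA draws

aggregate(!is.na(random_scenarios_repeated$inds),
    by=list(random_scenarios_repeated$mw), sum)$x
# 1 2  -- counts only successfully sampled scenarios, consistent with
#        nrand_samples = sum(!is.na(random_scenarios_repeated$inds)) above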

