|
61 | 61 | "# metadata columns used to identify exactly which image each set of features comes from\n",
|
62 | 62 | "strata=[\"Image_Metadata_Well\", \"Image_Metadata_Plate\", \"Image_Metadata_Site\"]\n",
|
63 | 63 | "\n",
|
64 |
| - "run_info_dictionary = {\n", |
65 |
| - " \"SHSY5Y_first_run\": {\n", |
66 |
| - " \"sql_file\": \"SHSY5Y_cells_incomplete_first_run.sqlite\",\n", |
67 |
| - " \"image_features_output_file\": pathlib.Path(f\"{features_output_dir}/frist_run_image_quality.csv.gz\"),\n", |
| 64 | + "# set directory for sqlite files\n", |
| 65 | + "sqlite_dir = pathlib.Path(\n", |
| 66 | + " \"/scratch/alpine/mlippincott@xsede.org/sqlite_files\"\n", |
| 67 | + ").resolve(strict=True)\n", |
68 | 68 | "\n",
|
| 69 | + "# dictionary with info for the sqlite file from each run\n", |
| 70 | + "run_info_dictionary = {\n", |
| 71 | + " \"batch_1\": {\n", |
| 72 | + " # path to outputted SQLite file\n", |
| 73 | + " \"source_path\": str(\n", |
| 74 | + " pathlib.Path(\n", |
| 75 | + " f\"{sqlite_dir}/PBMC_batch_1.sqlite\"\n", |
| 76 | + " )\n", |
| 77 | + " ),\n", |
| 78 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_1_image_quality.parquet\")),\n", |
| 79 | + " },\n", |
| 80 | + " \"batch_2\": {\n", |
| 81 | + " # path to outputted SQLite file\n", |
| 82 | + " \"source_path\": str(\n", |
| 83 | + " pathlib.Path(\n", |
| 84 | + " f\"{sqlite_dir}/PBMC_batch_2.sqlite\"\n", |
| 85 | + " )\n", |
| 86 | + " ),\n", |
| 87 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_2_image_quality.parquet\")),\n", |
| 88 | + " },\n", |
| 89 | + " \"batch_3\": {\n", |
| 90 | + " # path to outputted SQLite file\n", |
| 91 | + " \"source_path\": str(\n", |
| 92 | + " pathlib.Path(\n", |
| 93 | + " f\"{sqlite_dir}/PBMC_batch_3.sqlite\"\n", |
| 94 | + " )\n", |
| 95 | + " ),\n", |
| 96 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_3.parquet\")),\n", |
| 97 | + " },\n", |
| 98 | + " \"batch_4\": {\n", |
| 99 | + " # path to outputted SQLite file\n", |
| 100 | + " \"source_path\": str(\n", |
| 101 | + " pathlib.Path(\n", |
| 102 | + " f\"{sqlite_dir}/PBMC_batch_4.sqlite\"\n", |
| 103 | + " )\n", |
| 104 | + " ),\n", |
| 105 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_4.parquet\")),\n", |
| 106 | + " },\n", |
| 107 | + " \"batch_5\": {\n", |
| 108 | + " # path to outputted SQLite file\n", |
| 109 | + " \"source_path\": str(\n", |
| 110 | + " pathlib.Path(\n", |
| 111 | + " f\"{sqlite_dir}/PBMC_batch_5.sqlite\"\n", |
| 112 | + " )\n", |
| 113 | + " ),\n", |
| 114 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_5.parquet\")),\n", |
69 | 115 | " },\n",
|
70 |
| - " \"SHSY5Y_second_run\": {\n", |
71 |
| - " \"sql_file\": \"SHSY5Y_cells_second_run.sqlite\",\n", |
72 |
| - " \"image_features_output_file\": pathlib.Path(f\"{features_output_dir}/second_run_image_quality.csv.gz\"),\n", |
| 116 | + " \"batch_6\": {\n", |
| 117 | + " # path to outputted SQLite file\n", |
| 118 | + " \"source_path\": str(\n", |
| 119 | + " pathlib.Path(\n", |
| 120 | + " f\"{sqlite_dir}/PBMC_batch_6.sqlite\"\n", |
| 121 | + " )\n", |
| 122 | + " ),\n", |
| 123 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_6.parquet\")),\n", |
73 | 124 | " },\n",
|
| 125 | + " \"batch_7\": {\n", |
| 126 | + " # path to outputted SQLite file\n", |
| 127 | + " \"source_path\": str(\n", |
| 128 | + " pathlib.Path(\n", |
| 129 | + " f\"{sqlite_dir}/PBMC_batch_7.sqlite\"\n", |
| 130 | + " )\n", |
| 131 | + " ),\n", |
| 132 | + " \"dest_path\": str(pathlib.Path(f\"{features_output_dir}/PBMC_batch_7.parquet\")),\n", |
| 133 | + " } \n", |
74 | 134 | "}\n"
|
75 | 135 | ]
|
76 | 136 | },
|
|
367 | 427 | "source": [
|
368 | 428 | "# read in SQLite Per_Image table as dataframe for each run (source_path is already an absolute path, so it is used directly in the SQLite URI)\n",
|
369 | 429 | "## First run\n",
|
370 |
| - "sql_file_first_run = run_info_dictionary[\"SHSY5Y_first_run\"][\"sql_file\"]\n", |
| 430 | + "sql_file_first_run = run_info_dictionary[\"batch_1\"][\"source_path\"]\n", |
371 | 431 | "single_cell_file_first_run = f\"sqlite:///{sql_file_first_run}\"\n",
|
372 | 432 | "image_df_first_run = extract_utils.load_sqlite_as_df(\n",
|
373 | 433 | " sqlite_file_path=single_cell_file_first_run, image_table_name=\"Per_Image\"\n",
|
374 | 434 | ")\n",
|
| 435 | + "\n", |
375 | 436 | "## Second run\n",
|
376 |
| - "sql_file_second_run = run_info_dictionary[\"SHSY5Y_second_run\"][\"sql_file\"]\n", |
| 437 | + "sql_file_second_run = run_info_dictionary[\"batch_2\"][\"source_path\"]\n", |
377 | 438 | "single_cell_file_second_run = f\"sqlite:///{sql_file_second_run}\"\n",
|
378 | 439 | "image_df_second_run = extract_utils.load_sqlite_as_df(\n",
|
379 | 440 | " sqlite_file_path=single_cell_file_second_run, image_table_name=\"Per_Image\"\n",
|
380 | 441 | ")\n",
|
381 | 442 | "\n",
|
| 443 | + "## Third run\n", |
| 444 | + "sql_file_third_run = run_info_dictionary[\"batch_3\"][\"source_path\"]\n", |
| 445 | + "single_cell_file_third_run = f\"sqlite:///{sql_file_third_run}\"\n", |
| 446 | + "image_df_third_run = extract_utils.load_sqlite_as_df(\n", |
| 447 | + " sqlite_file_path=single_cell_file_third_run, image_table_name=\"Per_Image\"\n", |
| 448 | + ")\n", |
| 449 | + "\n", |
| 450 | + "## Fourth run\n", |
| 451 | + "sql_file_fourth_run = run_info_dictionary[\"batch_4\"][\"source_path\"]\n", |
| 452 | + "single_cell_file_fourth_run = f\"sqlite:///{sql_file_fourth_run}\"\n", |
| 453 | + "image_df_fourth_run = extract_utils.load_sqlite_as_df(\n", |
| 454 | + " sqlite_file_path=single_cell_file_fourth_run, image_table_name=\"Per_Image\"\n", |
| 455 | + ")\n", |
| 456 | + "\n", |
| 457 | + "## Fifth run\n", |
| 458 | + "sql_file_fifth_run = run_info_dictionary[\"batch_5\"][\"source_path\"]\n", |
| 459 | + "single_cell_file_fifth_run = f\"sqlite:///{sql_file_fifth_run}\"\n", |
| 460 | + "image_df_fifth_run = extract_utils.load_sqlite_as_df(\n", |
| 461 | + " sqlite_file_path=single_cell_file_fifth_run, image_table_name=\"Per_Image\"\n", |
| 462 | + ")\n", |
| 463 | + "\n", |
| 464 | + "## Sixth run\n", |
| 465 | + "sql_file_sixth_run = run_info_dictionary[\"batch_6\"][\"source_path\"]\n", |
| 466 | + "single_cell_file_sixth_run = f\"sqlite:///{sql_file_sixth_run}\"\n", |
| 467 | + "image_df_sixth_run = extract_utils.load_sqlite_as_df(\n", |
| 468 | + " sqlite_file_path=single_cell_file_sixth_run, image_table_name=\"Per_Image\"\n", |
| 469 | + ")\n", |
| 470 | + "\n", |
| 471 | + "## Seventh run\n", |
| 472 | + "sql_file_seventh_run = run_info_dictionary[\"batch_7\"][\"source_path\"]\n", |
| 473 | + "single_cell_file_seventh_run = f\"sqlite:///{sql_file_seventh_run}\"\n", |
| 474 | + "image_df_seventh_run = extract_utils.load_sqlite_as_df(\n", |
| 475 | + " sqlite_file_path=single_cell_file_seventh_run, image_table_name=\"Per_Image\"\n", |
| 476 | + ")\n", |
| 477 | + "\n", |
| 478 | + "\n", |
| 479 | + "\n", |
| 480 | + "\n", |
| 481 | + "\n", |
382 | 482 | "# merge the dataframes together into one combined run\n",
|
383 |
| - "SHSY5Y_run_df = pd.concat([image_df_first_run, image_df_second_run], ignore_index=True)\n", |
| 483 | + "PBMC_run_df = pd.concat([image_df_first_run, image_df_second_run, image_df_third_run, image_df_fourth_run, image_df_fifth_run, image_df_sixth_run, image_df_seventh_run], ignore_index=True)\n", |
384 | 484 | "\n",
|
385 |
| - "print(SHSY5Y_run_df.shape)\n", |
386 |
| - "SHSY5Y_run_df.head()" |
| 485 | + "print(PBMC_run_df.shape)\n", |
| 486 | + "PBMC_run_df.head()" |
387 | 487 | ]
|
388 | 488 | },
|
389 | 489 | {
|
|
408 | 508 | }
|
409 | 509 | ],
|
410 | 510 | "source": [
|
411 |
| - "# extract image quality features from merged SHSY5Y runs image table df\n", |
| 511 | + "# extract image quality features from merged PBMC runs image table df\n", |
412 | 512 | "image_features_df = extract_utils.extract_image_features(\n",
|
413 | 513 | " image_feature_categories=image_feature_categories,\n",
|
414 |
| - " image_df=SHSY5Y_run_df,\n", |
| 514 | + " image_df=PBMC_run_df,\n", |
415 | 515 | " image_cols=image_cols,\n",
|
416 | 516 | " strata=strata\n",
|
417 | 517 | ")\n",
|
|
427 | 527 | "# output df as parquet file\n",
|
428 | 528 | "output(\n",
|
429 | 529 | " df=annotated_image_features_df,\n",
|
430 |
| - " output_filename=pathlib.Path(f\"{features_output_dir}/plate2_SHSY5Y_image_features.parquet\"),\n", |
| 530 | + " output_filename=pathlib.Path(f\"{features_output_dir}/plate2_PBMC_image_features.parquet\"),\n", |
431 | 531 | " output_type='parquet',\n",
|
432 | 532 | ")\n",
|
433 |
| - "print(\"The image features for the SHSY5Y cells have been extracted and saved!\")" |
| 533 | + "print(\"The image features for the PBMC cells have been extracted and saved!\")" |
434 | 534 | ]
|
435 | 535 | },
|
436 | 536 | {
|
|
0 commit comments