-
Notifications
You must be signed in to change notification settings - Fork 1
/
analysis_python.py
executable file
·136 lines (111 loc) · 5.95 KB
/
analysis_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# Daniel B. Stribling
# Renne Lab, University of Florida
# Hybkit Project : http://www.github.com/RenneLab/hybkit
"""
Analysis for sample_grouped_target_analysis performed as a python workflow.
Provided as an example of direct usage of hybkit functions.
File names are hardcoded, and functions are accessed directly.
For more details, see: "grouped_target_analysis_notes.rst".
"""
import os
import sys
import hybkit
# Set count_mode:
# count_mode = 'read' # Count reads represented by each record, instead of number of records.
count_mode = 'record' # Count each record/line as one, unless record is combined.
# (Default count mode, but specified here for readability)
# Set script directories and input file names.
analysis_dir = os.path.abspath(os.path.dirname(__file__))
analysis_label = 'KSHV_Hyb_Combined'
out_dir = os.path.join(analysis_dir, 'output')
input_files = [
os.path.join(analysis_dir, 'GSM2720020_WT_BR1.hyb'),
os.path.join(analysis_dir, 'GSM2720021_WT_BR1.hyb'),
os.path.join(analysis_dir, 'GSM2720022_WT_BR1.hyb'),
os.path.join(analysis_dir, 'GSM2720023_D11_BR1.hyb'),
os.path.join(analysis_dir, 'GSM2720024_D11_BR1.hyb'),
os.path.join(analysis_dir, 'GSM2720025_D11_BR1.hyb')
]
out_file_path = os.path.join(analysis_dir, 'output', 'KSHV_Hyb_Combined.hyb')
match_legend_file = os.path.join(analysis_dir, 'string_match_legend.csv')
# Begin Analysis
print('\nPerforming Grouped Target Analysis...')
if not os.path.isdir(out_dir):
print('Creating Output Directory:\n %s\n' % out_dir)
os.mkdir(out_dir)
print('Analyzing Files:')
print(' ' + '\n '.join(input_files) + '\n')
# Set the method of finding segment type
match_parameters = hybkit.HybRecord.make_string_match_parameters(match_legend_file)
hybkit.HybRecord.select_find_type_method('string_match', match_parameters)
# Create custom list of miRNA types for analysis
mirna_types = list(hybkit.HybRecord.MIRNA_TYPES) + ['kshv_microRNA']
# Set hybrid segment types to remove as part of quality control (QC)
remove_types = ['rRNA', 'mitoch_rRNA']
# Begin Analysis
print('Outputting KSHV-Specific Hybrids to:\n %s\n' % out_file_path)
# Open out-file for writing, one output file will be used for all inputs.
with hybkit.HybFile(out_file_path, 'w') as out_kshv_file:
# Iterate through input files:
for in_file_path in input_files:
in_file_name = os.path.basename(in_file_path)
in_file_label = in_file_name.split('_')[0]
print('Analyzing:\n %s' % in_file_path)
with hybkit.HybFile(in_file_path, 'r') as in_file:
# Iterate over each record of the input file
for hyb_record in in_file:
# Analyze only sequences where a segment identifier contains the string "kshv"
if hyb_record.has_property('seg_contains', 'kshv'):
# Find the segment types of each record
hyb_record.set_flag('source', in_file_label)
hyb_record.find_seg_types()
hyb_record.mirna_analysis(mirna_types=mirna_types)
# Determine if record has type that is excluded
use_record = True
for remove_type in remove_types:
if hyb_record.has_property('seg_type', remove_type):
use_record = False
break
# If record has an excluded type, continue to next record without analyzing.
if not use_record:
continue
# Write the records to the output file
out_kshv_file.write_record(hyb_record)
print('\nPerforming Combined Target Analysis...\n')
# Reiterate over output HybFile and perform target analysis.
with hybkit.HybFile(out_file_path, 'r') as out_kshv_file:
# Prepare target analysis dict:
target_dict = {}
for hyb_record in out_kshv_file:
# Repeat .mirna_analysis, so that hyb_record object has associated metadata
hyb_record.mirna_analysis(mirna_types=mirna_types)
# Perform target-analysis of mirna within kshv-associated data.
hybkit.analysis.addto_mirna_target(hyb_record, target_dict,
count_mode=count_mode,
double_count_duplexes=True, # Includes mirna duplexes
# Limits output to KSHV miRNA:
mirna_contains='kshv')
# Process and sort dictionary of miRNA and targets
results = hybkit.analysis.process_mirna_target(target_dict)
(sorted_target_dict, # Contains same data as target_dict, but with keys sorted by count
counts_dict, # dict with keys: mirna_id, values: total mirna-specific hybrids
target_type_counts_dict, # dict with keys: mirna_id, values: dict of targeted type counts
total_count # integer: total number of mirna found.
) = results
# Write target information to output file
# Set analysis basename without ".hyb" extension
analysis_basename = out_file_path.replace('.hyb','')
print('Writing Analyses to files named after:\n %s\n' % analysis_basename)
hybkit.analysis.write_mirna_target(analysis_basename,
sorted_target_dict,
counts_dict,
target_type_counts_dict,
name=analysis_label,
multi_files=True, # Default
sep=',', # Default
file_suffix='.csv', # Default
spacer_line=True, # Default
make_plots=True, # Default
max_mirna=25) # Default is 10
print('Done\n')