1
+ import csv
2
+
3
input_file = "COI-5P.tax.tsv"  # Your original TSV file
output_file = "converted_COI-5P.tax.tsv"  # The new TSV file
input_row_count = 0
output_row_count = 0


def _smallest_taxonomic_level(taxonomic_path):
    """Return the most specific usable taxon of a ';'-separated path.

    The path is scanned from its last component backwards. Components equal
    to "NA" and empty/whitespace-only components (e.g. produced by a trailing
    ';') are ignored.

    Args:
        taxonomic_path: The raw path string, components separated by ';'.

    Returns:
        The last usable component, or None when the path has none.
    """
    for taxon in reversed(taxonomic_path.split(";")):
        # An empty component is not a name; keep looking further up the path
        # instead of giving up on the whole row.
        if taxon.strip() and taxon != "NA":
            return taxon
    return None


# Convert "<id>\t<tax;path>" rows into "<smallest taxon>\t<id>" rows.
# newline="" on both files lets the csv module handle line endings itself.
with open(input_file, "r", newline="") as infile, \
        open(output_file, "w", newline="") as outfile:
    # csv requires a single-character delimiter: a bare tab.
    tsv_reader = csv.reader(infile, delimiter="\t")
    tsv_writer = csv.writer(outfile, delimiter="\t")

    for input_row_count, row in enumerate(tsv_reader, start=1):
        if len(row) < 2:
            print(f"Skipping malformed row #{input_row_count}: {row}")
            continue

        original_id = row[0]
        smallest_taxonomic_level = _smallest_taxonomic_level(row[1])

        if smallest_taxonomic_level:
            tsv_writer.writerow([smallest_taxonomic_level, original_id])
            output_row_count += 1
        else:
            # NOTE(review): skipped rows are absent from the output, so the
            # output can have fewer rows than the input.
            print(f"No valid taxonomic level found in row #{input_row_count}")
34
+
35
+ # print(f"Conversion complete. Processed {input_row_count} rows and wrote {output_row_count} rows.")
36
+ # print("The output is saved in", output_file)
37
+
38
+
39
+ # taxonkit name2taxid converted_COI-5P.tax.tsv > COI-5P.bold.tax
40
+ # awk -F "\t" '!seen[$2]++' COI-5P.bold.tax > COI-5P.bold.tax.unique; mv COI-5P.bold.tax.unique COI-5P.bold.tax
41
+ # cut -f2,3 COI-5P.bold.tax > replacement_map.txt; mv replacement_map.txt COI-5P.bold.tax
42
+
43
+
44
+
45
+ # 1) replace BOLD ID with Tax ID
46
def pair_lines(file):
    """Yield the lines of *file* two at a time as whitespace-stripped pairs.

    Args:
        file: Any iterable of strings (an open text file, a list of lines,
            a generator, ...). It is consumed sequentially.

    Yields:
        ``(first, second)`` tuples of consecutive lines, each passed through
        ``str.strip()``. If the input has an odd number of lines, the final
        tuple's second element is the empty string.
    """
    # iter() is a no-op for file objects and other iterators, and lets plain
    # sequences such as lists be used as well.
    lines = iter(file)
    for first in lines:
        # The default keeps a trailing unpaired line instead of raising.
        second = next(lines, "")
        yield (first.strip(), second.strip())
53
+
54
+ # Inputs, read in lockstep: the ID-to-taxid map (COI-5P.bold.tax), the taxonomy table (COI-5P.tax.tsv), and the FASTA file read two lines per record (COI-5P.fasta)
55
# Rewrite the taxonomy table and the FASTA so that each record is keyed by the
# second tab-separated field of the matching COI-5P.bold.tax line (the tax ID).
# The three inputs are consumed in lockstep, one record each per iteration;
# zip() stops at the shortest input. Inputs are opened before the outputs so a
# missing input does not leave empty *_tmp files behind.
with open('COI-5P.bold.tax', 'r') as file1, \
        open('COI-5P.tax.tsv', 'r') as file2, \
        open('COI-5P.fasta', 'r') as file3, \
        open('COI-5P.tax.tsv_tmp', 'w') as out2, \
        open('COI-5P.fasta_tmp', 'w') as out3:

    # The FASTA is read two lines at a time: (header, sequence).
    file3_pairs = pair_lines(file3)

    for line1, line2, (_header, sequence) in zip(file1, file2, file3_pairs):
        id_fields = line1.strip().split('\t')
        if len(id_fields) < 2:
            # No tax ID on this line: drop the record from both outputs.
            continue

        tax_fields = line2.strip().split('\t')
        if len(tax_fields) < 2:
            # Taxonomy row without a path column: drop the record from both
            # outputs rather than failing with an IndexError.
            continue

        taxid = id_fields[1]
        path = tax_fields[1]

        out2.write(taxid + '\t' + path + '\n')
        out3.write('>' + taxid + '\n')
        out3.write(sequence + '\n')
83
+
84
+ # Add your processing logic here
85
+
86
+ # mv COI-5P.fasta_tmp COI-5P.fasta
87
+ # mv COI-5P.tax.tsv_tmp COI-5P.tax.tsv
88
+
89
+ # 2) de-replicate on unique sequences
90
# Maps each sequence seen so far to the FASTA header of its first occurrence.
unique_sequences = {}

# Keep only the first record for every distinct sequence. Kept records go to
# the *_tmp taxonomy and FASTA files; the headers of dropped duplicates (with
# the leading '>' removed) are listed in COI-5P_prune.txt.
with open('COI-5P.tax.tsv', 'r') as file1, \
        open('COI-5P.fasta', 'r') as file2, \
        open('COI-5P.tax.tsv_tmp', 'w') as out1, \
        open('COI-5P.fasta_tmp', 'w') as out2, \
        open('COI-5P_prune.txt', 'w') as out3:

    # The FASTA is read two lines at a time: (header, sequence).
    fasta_pairs = pair_lines(file2)

    # Taxonomy rows and FASTA records are consumed in lockstep.
    for line1, (part1, part2) in zip(file1, fasta_pairs):
        seq = part2.strip()
        if seq not in unique_sequences:
            # Record the sequence so later identical ones are recognised as
            # duplicates; without this nothing would ever be pruned.
            unique_sequences[seq] = part1
            out1.write(line1)  # line1 still carries its own line ending
            out2.write(part1 + '\n')
            out2.write(part2 + '\n')
        else:
            out3.write(part1.lstrip(">") + '\n')
0 commit comments