Skip to content

Commit 2fe7668

Browse files
committed
scripts used for bold
1 parent 303c8b9 commit 2fe7668

File tree

3 files changed

+267
-0
lines changed

3 files changed

+267
-0
lines changed

misc/bold-fix-fasta.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import os
2+
import argparse
3+
4+
parser = argparse.ArgumentParser(description='')
5+
parser.add_argument('--output', type=str)
6+
parser.add_argument('--input', type=str)
7+
parser.add_argument('--taxa', type=str)
8+
parser.add_argument('--log', type=str)
9+
args = parser.parse_args()
10+
11+
12+
filepath = args.input
13+
output = args.output
14+
taxonomy = args.taxa
15+
logs=args.log
16+
17+
info_dict = {}
18+
# get largest seq per ntid
19+
with open(filepath) as input:
20+
counter = 0
21+
for line in input:
22+
if(line.startswith(">")): # awk converts tabs to spaces
23+
ntid=line.lstrip(">").strip()
24+
else:
25+
length = len(line.strip())
26+
index = counter
27+
counter += 1
28+
if length < 100:
29+
continue
30+
try:
31+
if length > info_dict[ntid]['length']:
32+
info_dict[ntid] = { 'length': length,
33+
'index': index,
34+
'filename': filepath}
35+
except KeyError as e:
36+
info_dict[ntid] = { 'length': length,
37+
'index': index,
38+
'filename': filepath}
39+
40+
with open(output, 'a') as out:
41+
with open(filepath, 'r') as input:
42+
counter = 0
43+
for line in input:
44+
if(line.startswith(">")):
45+
ntid=line.lstrip(">").rstrip()
46+
else:
47+
seq=line.strip()
48+
try:
49+
if counter == info_dict[ntid]['index'] and ntid != "*":
50+
out.writelines('>' + ntid + '\n')
51+
out.writelines(seq + '\n')
52+
else:
53+
with open(logs, 'a+') as logfile:
54+
logfile.writelines(ntid + '\n')
55+
except KeyError as e:
56+
with open(logs, 'a+') as logfile:
57+
logfile.writelines(ntid + '\n')
58+
counter += 1
59+
60+
61+
with open(f"{taxonomy}_new", 'a') as out:
62+
with open(taxonomy, 'r') as input:
63+
counter = 0
64+
for line in input:
65+
linespl = line.split('\t')
66+
ntid = linespl[0]
67+
try:
68+
if counter == info_dict[ntid]['index'] and ntid != "*":
69+
tax_path = linespl[1].strip().replace(",", ";").replace("None","NA")
70+
out.writelines(ntid + '\t' + tax_path + '\n')
71+
else:
72+
with open(logs, 'a+') as logfile:
73+
logfile.writelines(ntid + '\n')
74+
except KeyError as e:
75+
with open(logs, 'a+') as logfile:
76+
logfile.writelines(ntid + '\n')
77+
counter += 1
78+
79+
80+
# change domain: sed -i 's/^\([^\t]*\)\t[^\t;]*;/\1\tEukaryota;/' COI-5P.tax.tsv
81+
# remove uncultured on fasta
82+
# awk '
83+
# > # Load IDs into an associative array
84+
# > NR==FNR {ids[$0]=1; next}
85+
# >
86+
# > # If the line starts with ">", check against the IDs
87+
# > /^>/ {
88+
# > # Extract the ID from the FASTA header
89+
# > split($0, a, /[>|]/);
90+
# > fasta_id=a[2];
91+
# >
92+
# > # Determine if this entry should be printed
93+
# > printit = !(fasta_id in ids)
94+
# > }
95+
# >
96+
# > # Print the line if printit is true
97+
# > {if(printit) print}
98+
# > ' "$id_file" "$fasta_file" > "$temp_fasta"

misc/bold2ednaexplorer.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#! /bin/bash
2+
3+
# loop through fasta
4+
# 1) check if marker = COI-5P
5+
# 2) parse taxon path:
6+
# ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
7+
# [0, 1, 2, 3, 4, 6, 7]
8+
9+
fasta_file="bold.fasta"
10+
fasta_out="Fungi.fasta"
11+
taxa_out="Fungi.tax.tsv"
12+
13+
process_sequence() {
14+
IFS='|' read -r id marker country taxonomy rest <<< "$current_header"
15+
# Converting taxonomy to an array
16+
IFS=',' read -r -a taxonomy_array <<< "$taxonomy"
17+
# Removing subfamily and subspecies
18+
unset taxonomy_array[5]
19+
unset taxonomy_array[-1]
20+
fungi_list=("Ascomycota" "Basidiomycota" "Chytridiomycota" "Glomeromycota" "Myxomycota" "Zygomycota")
21+
# Loop through the list and check each item
22+
for phylum in "${fungi_list[@]}"; do
23+
if [[ "$phylum" == "${taxonomy_array[1]}" ]]; then
24+
# Convert the array back to a string, joined by commas
25+
IFS=',' new_taxonomy=$(printf "%s," "${taxonomy_array[@]}")
26+
new_taxonomy=${new_taxonomy%,} # Removing the trailing comma
27+
28+
# remove leading char
29+
id="${id#>}"
30+
31+
# append to out files
32+
echo ">$id" >> $fasta_out
33+
echo $current_sequence >> $fasta_out
34+
echo -e "$id\t$new_taxonomy" >> "$taxa_out"
35+
break
36+
fi
37+
done
38+
}
39+
40+
# Read the FASTA file line by line
41+
while IFS= read -r line
42+
do
43+
if [[ $line == ">"* ]]; then
44+
# Process the previous sequence
45+
if [ -n "$current_header" ]; then
46+
process_sequence
47+
fi
48+
# Update the current header and reset the sequence
49+
current_header=$line
50+
current_sequence=""
51+
else
52+
# Append the line to the current sequence
53+
current_sequence+=$line
54+
fi
55+
done < "$fasta_file"
56+
57+
# Process the last sequence in the file
58+
if [ -n "$current_header" ]; then
59+
process_sequence
60+
fi

misc/bold2taxid.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import csv
2+
3+
input_file = "COI-5P.tax.tsv" # Your original TSV file
4+
output_file = "converted_COI-5P.tax.tsv" # The new TSV file
5+
input_row_count = 0
6+
output_row_count = 0
7+
8+
with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
9+
tsv_reader = csv.reader(infile, delimiter='\t')
10+
tsv_writer = csv.writer(outfile, delimiter='\t')
11+
12+
for row in tsv_reader:
13+
input_row_count += 1
14+
15+
if len(row) < 2:
16+
print(f"Skipping malformed row #{input_row_count}: {row}")
17+
continue
18+
19+
original_id = row[0]
20+
taxonomic_path = row[1].split(';')
21+
smallest_taxonomic_level = None
22+
23+
# Reverse the taxonomic path and pick the first non-"NA" level
24+
for taxon in reversed(taxonomic_path):
25+
if taxon != "NA":
26+
smallest_taxonomic_level = taxon
27+
break
28+
29+
if smallest_taxonomic_level:
30+
tsv_writer.writerow([smallest_taxonomic_level, original_id])
31+
output_row_count += 1
32+
else:
33+
print(f"No valid taxonomic level found in row #{input_row_count}")
34+
35+
# print(f"Conversion complete. Processed {input_row_count} rows and wrote {output_row_count} rows.")
36+
# print("The output is saved in", output_file)
37+
38+
39+
# taxonkit name2taxid converted_COI-5P.tax.tsv > COI-5P.bold.tax
40+
# awk -F "\t" '!seen[$2]++' COI-5P.bold.tax > COI-5P.bold.tax.unique; mv COI-5P.bold.tax.unique COI-5P.bold.tax
41+
# cut -f2,3 COI-5P.bold.tax > replacement_map.txt; mv replacement_map.txt COI-5P.bold.tax
42+
43+
44+
45+
# 1) replace BOLD ID with Tax ID
46+
def pair_lines(file):
47+
"""Yield a pair of lines from the file at a time."""
48+
line1 = next(file, None)
49+
while line1 is not None:
50+
line2 = next(file, "")
51+
yield (line1.strip(), line2.strip())
52+
line1 = next(file, None)
53+
54+
# Replace 'file1.txt', 'file2.txt', and 'file3.txt' with your actual file paths
55+
with open('COI-5P.bold.tax', 'r') as file1, \
56+
open('COI-5P.tax.tsv', 'r') as file2, \
57+
open('COI-5P.fasta', 'r') as file3:
58+
59+
with open('COI-5P.tax.tsv_tmp', 'w') as out2, \
60+
open('COI-5P.fasta_tmp', 'w') as out3:
61+
62+
# Create an iterator for the third file
63+
file3_pairs = pair_lines(file3)
64+
65+
# Iterate through the files
66+
for line1, line2, (part1, part2) in zip(file1, file2, file3_pairs):
67+
# line1: entry from file1
68+
# line2: entry from file2
69+
# part1, part2: two-line entry from file3
70+
71+
# Process the lines
72+
# Example: print them
73+
#TODO: OUT2 (taxa): line1[1] + '\t' + line2[1]
74+
#TODO: OUT3 (fasta): '>'+line1[1] + '\n' + part2
75+
parts=line1.strip().split('\t')
76+
if len(parts) < 2:
77+
continue
78+
taxid=parts[1]
79+
path=line2.strip().split('\t')[1]
80+
out2.write(taxid + "\t" + path + '\n')
81+
out3.write('>' + taxid + '\n')
82+
out3.write(part2 + '\n')
83+
84+
# Add your processing logic here
85+
86+
# mv COI-5P.fasta_tmp COI-5P.fasta
87+
# mv COI-5P.tax.tsv_tmp COI-5P.tax.tsv
88+
89+
# 2) de-replicate on unique sequences
90+
unique_sequences = {}
91+
with open('COI-5P.tax.tsv', 'r') as file1, \
92+
open('COI-5P.fasta', 'r') as file2:
93+
94+
with open('COI-5P.tax.tsv_tmp', 'w') as out1, \
95+
open('COI-5P.fasta_tmp', 'w') as out2, \
96+
open('COI-5P_prune.txt', 'w') as out3:
97+
98+
# Create an iterator for the third file
99+
fasta_pairs = pair_lines(file2)
100+
101+
# Iterate through the files
102+
for line1, (part1, part2) in zip(file1, fasta_pairs):
103+
seq=part2.strip()
104+
if seq not in unique_sequences:
105+
out1.write(line1)
106+
out2.write(part1 + '\n')
107+
out2.write(part2 + '\n')
108+
else:
109+
out3.write(part1.lstrip(">") + '\n')

0 commit comments

Comments
 (0)