scripts used for bold

hector-baez · hector-baez · commit 2fe7668fbb71 · 2024-01-30T15:19:50.000-06:00
diff --git a/misc/bold-fix-fasta.py b/misc/bold-fix-fasta.py
@@ -0,0 +1,98 @@
+import os
+import argparse
+
+parser = argparse.ArgumentParser(description='')
+parser.add_argument('--output', type=str)
+parser.add_argument('--input', type=str)
+parser.add_argument('--taxa', type=str)
+parser.add_argument('--log', type=str)
+args = parser.parse_args()
+
+
+filepath = args.input
+output = args.output
+taxonomy = args.taxa
+logs=args.log
+
+info_dict = {}
+# get largest seq per ntid
+with open(filepath) as input:
+    counter = 0
+    for line in input:
+        if(line.startswith(">")): # awk converts tabs to spaces
+            ntid=line.lstrip(">").strip()
+        else:
+            length = len(line.strip())
+            index = counter
+            counter += 1
+            if length < 100:
+                continue
+            try:
+                if length > info_dict[ntid]['length']:
+                    info_dict[ntid] = { 'length': length,
+                                        'index': index,
+                                        'filename': filepath}
+            except KeyError as e:
+                    info_dict[ntid] = { 'length': length,
+                                        'index': index,
+                                        'filename': filepath}
+
+with open(output, 'a') as out:
+    with open(filepath, 'r') as input:
+            counter = 0
+            for line in input:
+                if(line.startswith(">")):
+                    ntid=line.lstrip(">").rstrip()
+                else:
+                    seq=line.strip()
+                    try:
+                        if counter == info_dict[ntid]['index'] and ntid != "*":
+                            out.writelines('>' + ntid + '\n')
+                            out.writelines(seq + '\n')
+                        else:
+                            with open(logs, 'a+') as logfile:
+                                logfile.writelines(ntid + '\n')
+                    except KeyError as e:
+                        with open(logs, 'a+') as logfile:
+                                logfile.writelines(ntid + '\n')
+                    counter += 1
+
+
+with open(f"{taxonomy}_new", 'a') as out:
+    with open(taxonomy, 'r') as input:
+        counter = 0
+        for line in input:
+            linespl = line.split('\t')
+            ntid = linespl[0]
+            try:
+                if counter == info_dict[ntid]['index'] and ntid != "*":
+                    tax_path = linespl[1].strip().replace(",", ";").replace("None","NA")
+                    out.writelines(ntid + '\t' + tax_path + '\n')
+                else:
+                    with open(logs, 'a+') as logfile:
+                        logfile.writelines(ntid + '\n')
+            except KeyError as e:
+                with open(logs, 'a+') as logfile:
+                        logfile.writelines(ntid + '\n')
+            counter += 1
+
+
+# change domain: sed -i 's/^\([^\t]*\)\t[^\t;]*;/\1\tEukaryota;/' COI-5P.tax.tsv
+# remove uncultured on fasta
+# awk '
+# >     # Load IDs into an associative array
+# >     NR==FNR {ids[$0]=1; next}
+# > 
+# >     # If the line starts with ">", check against the IDs
+# >     /^>/ { 
+# >         # Extract the ID from the FASTA header
+# >         split($0, a, /[>|]/); 
+# >         fasta_id=a[2]; 
+# > 
+# >         # Determine if this entry should be printed
+# >         printit = !(fasta_id in ids)
+# >     }
+# > 
+# >     # Print the line if printit is true
+# >     {if(printit) print}
+# > ' "$id_file" "$fasta_file" > "$temp_fasta"
diff --git a/misc/bold2ednaexplorer.sh b/misc/bold2ednaexplorer.sh
@@ -0,0 +1,60 @@
+#! /bin/bash
+
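+# loop through fasta
+# 1) check if marker = COI-5P
+#    (NOTE: the code below parses the marker field but does not filter on it;
+#    records are kept when their phylum is in the fungi list)
+# 2) parse taxon path, dropping index 5 (subfamily) and the last entry (subspecies):
+# ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
+# [0, 1, 2, 3, 4, 6, 7]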
+
+fasta_file="bold.fasta"
+fasta_out="Fungi.fasta"
+taxa_out="Fungi.tax.tsv"
+
+# Handle one FASTA record held in the globals $current_header and
+# $current_sequence. The header is split on '|' into id, marker, country and
+# taxonomy; if the phylum (second taxonomy field) is one of the fungal phyla,
+# the record is appended to $fasta_out and "id<TAB>taxonomy" to $taxa_out.
+process_sequence() {
+    # Keep every working variable local so nothing leaks into the main loop.
+    local id marker country taxonomy rest phylum new_taxonomy
+    local -a taxonomy_array
+    local -a fungi_list=("Ascomycota" "Basidiomycota" "Chytridiomycota" "Glomeromycota" "Myxomycota" "Zygomycota")
+
+    IFS='|' read -r id marker country taxonomy rest <<< "$current_header"
+    # Converting taxonomy to an array
+    IFS=',' read -r -a taxonomy_array <<< "$taxonomy"
+    # No taxonomy at all: nothing can match, and unset [-1] would error.
+    (( ${#taxonomy_array[@]} > 0 )) || return 0
+    # Removing subfamily and subspecies (subscripts quoted to avoid globbing)
+    unset 'taxonomy_array[5]'
+    unset 'taxonomy_array[-1]'
+
+    # Loop through the list and check each item
+    for phylum in "${fungi_list[@]}"; do
+        if [[ "$phylum" == "${taxonomy_array[1]}" ]]; then
+            # Convert the array back to a string, joined by commas.
+            # (The old "IFS=',' var=$(...)" form had no command after the
+            # assignments, so it changed IFS for the rest of the script.)
+            new_taxonomy=$(printf '%s,' "${taxonomy_array[@]}")
+            new_taxonomy=${new_taxonomy%,} # Removing the trailing comma
+
+            # remove leading char
+            id="${id#>}"
+
+            # append to out files; printf writes the data verbatim, whereas
+            # echo -e would interpret backslashes inside the id or taxonomy
+            printf '>%s\n' "$id" >> "$fasta_out"
+            printf '%s\n' "$current_sequence" >> "$fasta_out"
+            printf '%s\t%s\n' "$id" "$new_taxonomy" >> "$taxa_out"
+            break
+        fi
+    done
+}
+
+# Read the FASTA file line by line
+while IFS= read -r line
+do
+    if [[ $line == ">"* ]]; then
+        # Process the previous sequence
+        if [ -n "$current_header" ]; then
+            process_sequence
+        fi
+        # Update the current header and reset the sequence
+        current_header=$line
+        current_sequence=""
+    else
+        # Append the line to the current sequence
+        current_sequence+=$line
+    fi
+done < "$fasta_file"
+
+# Process the last sequence in the file
+if [ -n "$current_header" ]; then
+    process_sequence
+fi
diff --git a/misc/bold2taxid.py b/misc/bold2taxid.py
@@ -0,0 +1,109 @@
+import csv
+
+input_file = "COI-5P.tax.tsv"  # Your original TSV file
+output_file = "converted_COI-5P.tax.tsv"  # The new TSV file
+input_row_count = 0
+output_row_count = 0
+
+with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
+    tsv_reader = csv.reader(infile, delimiter='\t')
+    tsv_writer = csv.writer(outfile, delimiter='\t')
+
+    for row in tsv_reader:
+        input_row_count += 1
+
+        if len(row) < 2:
+            print(f"Skipping malformed row #{input_row_count}: {row}")
+            continue
+
+        original_id = row[0]
+        taxonomic_path = row[1].split(';')
+        smallest_taxonomic_level = None
+
+        # Reverse the taxonomic path and pick the first non-"NA" level
+        for taxon in reversed(taxonomic_path):
+            if taxon != "NA":
+                smallest_taxonomic_level = taxon
+                break
+
+        if smallest_taxonomic_level:
+            tsv_writer.writerow([smallest_taxonomic_level, original_id])
+            output_row_count += 1
+        else:
+            print(f"No valid taxonomic level found in row #{input_row_count}")
+
+# print(f"Conversion complete. Processed {input_row_count} rows and wrote {output_row_count} rows.")
+# print("The output is saved in", output_file)
+
+
+# taxonkit name2taxid converted_COI-5P.tax.tsv > COI-5P.bold.tax
+# awk -F "\t" '!seen[$2]++' COI-5P.bold.tax > COI-5P.bold.tax.unique; mv COI-5P.bold.tax.unique COI-5P.bold.tax
+# cut -f2,3 COI-5P.bold.tax > replacement_map.txt; mv replacement_map.txt COI-5P.bold.tax
+
+
+
+# 1) replace BOLD ID with Tax ID
+def pair_lines(file):
+    """Yield a pair of lines from the file at a time."""
+    line1 = next(file, None)
+    while line1 is not None:
+        line2 = next(file, "")
+        yield (line1.strip(), line2.strip())
+        line1 = next(file, None)
+
+# Replace 'file1.txt', 'file2.txt', and 'file3.txt' with your actual file paths
+with open('COI-5P.bold.tax', 'r') as file1, \
+     open('COI-5P.tax.tsv', 'r') as file2, \
+     open('COI-5P.fasta', 'r') as file3:
+    
+    with open('COI-5P.tax.tsv_tmp', 'w') as out2, \
+         open('COI-5P.fasta_tmp', 'w') as out3:
+
+        # Create an iterator for the third file
+        file3_pairs = pair_lines(file3)
+
+        # Iterate through the files
+        for line1, line2, (part1, part2) in zip(file1, file2, file3_pairs):
+            # line1: entry from file1
+            # line2: entry from file2
+            # part1, part2: two-line entry from file3
+
+            # Process the lines
+            # Example: print them
+            #TODO: OUT2 (taxa): line1[1] + '\t' + line2[1]
+            #TODO: OUT3 (fasta): '>'+line1[1] + '\n' + part2
+            parts=line1.strip().split('\t')
+            if len(parts) < 2:
+                continue
+            taxid=parts[1]
+            path=line2.strip().split('\t')[1]
+            out2.write(taxid + "\t" + path + '\n')
+            out3.write('>' + taxid + '\n')
+            out3.write(part2 + '\n')
+
+            # Add your processing logic here
+
+# mv COI-5P.fasta_tmp COI-5P.fasta
+# mv COI-5P.tax.tsv_tmp COI-5P.tax.tsv
+
+# 2) de-replicate on unique sequences
+unique_sequences = {}
+with open('COI-5P.tax.tsv', 'r') as file1, \
+     open('COI-5P.fasta', 'r') as file2:
+    
+    with open('COI-5P.tax.tsv_tmp', 'w') as out1, \
+         open('COI-5P.fasta_tmp', 'w') as out2, \
+         open('COI-5P_prune.txt', 'w') as out3:
+        
+        # Create an iterator for the third file
+        fasta_pairs = pair_lines(file2)
+
+        # Iterate through the files
+        for line1, (part1, part2) in zip(file1, fasta_pairs):
+            seq=part2.strip()
+            if seq not in unique_sequences:
+                out1.write(line1)
+                out2.write(part1 + '\n')
+                out2.write(part2 + '\n')
+            else:
+                out3.write(part1.lstrip(">") + '\n')