Skip to content

Commit

Permalink
Temporary commit. Needs to be amended!
Browse files Browse the repository at this point in the history
Working Notes:

- Profile and LightweightProfile are too divergent currently. The
  formats are completely different. What is intended?
- Added a toString(format, width) method, but some formats are fixed-width
  and should ignore this parameter and use their hard-coded width instead.
- Added a 'CONCENSUS' format with the idea that this would implement the
  request in biojava#983. However, I'm not sure what the equivalent is in an MSA.
  Also, the current version doesn't actually produce the desired output yet.
- ProfileFormatTest is just a main method for quick testing
  • Loading branch information
sbliven committed Jan 27, 2022
1 parent cf1cc42 commit a4006f0
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 15 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package org.biojava.nbio.alignment;

import java.util.Arrays;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
import org.biojava.nbio.core.alignment.template.AlignedSequence;
import org.biojava.nbio.core.alignment.template.Profile;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.template.Sequence;

/**
 * Scratch driver for eyeballing the string formats of a pairwise alignment.
 * It is a plain {@code main} method rather than an automated test: it aligns
 * two short protein sequences and prints a custom three-line rendering, the
 * default {@code toString()} and every {@link Profile.StringFormat}.
 */
public class ProfileFormatTest {

    /**
     * Tells whether two residues count as similar under the given matrix.
     *
     * @param matrix substitution matrix used for scoring
     * @param aa1 first residue
     * @param aa2 second residue
     * @return {@code true} if the substitution score is strictly positive
     */
    private static boolean isSimilar(SubstitutionMatrix<AminoAcidCompound> matrix, AminoAcidCompound aa1,
            AminoAcidCompound aa2) {
        return matrix.getValue(aa1, aa2) > 0;
    }

    /**
     * Renders the pairwise alignment as three lines: the first aligned
     * sequence, a marker row, and the second aligned sequence. Each sequence
     * row is prefixed with its start position (right-aligned to a common
     * width) and suffixed with its end position. The marker row holds
     * {@code '|'} under every column whose residue pair has a positive
     * BLOSUM62 score and a space elsewhere.
     *
     * @param alignment the alignment to render
     * @return the three-line rendering, without a trailing line separator
     */
    public static <S extends Sequence<C>, C extends AminoAcidCompound> String customFormat(
            SmithWaterman<S, C> alignment) {
        SubstitutionMatrix<AminoAcidCompound> blosum62 = SubstitutionMatrixHelper.getBlosum62();
        AlignedSequence<S, C> seq1 = alignment.getPair().getAlignedSequence(1);
        AlignedSequence<S, C> seq2 = alignment.getPair().getAlignedSequence(2);
        String s1 = seq1.toString();
        String s2 = seq2.toString();
        String start1 = seq1.getStart().getPosition().toString();
        String start2 = seq2.getStart().getPosition().toString();
        int startWidth = Math.max(start1.length(), start2.length());
        String startFormat = "%" + startWidth + "d ";

        StringBuilder s = new StringBuilder();
        s.append(String.format(startFormat, seq1.getStart().getPosition()));
        s.append(s1);
        s.append(String.format(" %d%n", seq1.getEnd().getPosition()));

        // The sequence rows start with the position column AND a separating
        // space, so the marker row must be padded by startWidth + 1 to keep
        // each marker directly under the residue pair it describes.
        s.append(String.format("%" + (startWidth + 1) + "s", ""));
        int columns = Math.min(s1.length(), s2.length());
        for (int i = 0; i < columns; i++) {
            // getCompoundAt is 1-based
            C c1 = seq1.getCompoundAt(i + 1);
            C c2 = seq2.getCompoundAt(i + 1);
            s.append(isSimilar(blosum62, c1, c2) ? '|' : ' ');
        }
        s.append(String.format("%n"));

        s.append(String.format(startFormat, seq2.getStart().getPosition()));
        s.append(s2);
        s.append(String.format(" %d", seq2.getEnd().getPosition()));

        return s.toString();
    }

    /**
     * Aligns two hard-coded protein sequences and prints every available
     * rendering of the resulting pair to standard output.
     *
     * @param args ignored
     * @throws CompoundNotFoundException if a sequence contains an unknown residue
     */
    public static void main(String[] args) throws CompoundNotFoundException {
        // Test alignment
        ProteinSequence query = new ProteinSequence("AERNDKK");
        ProteinSequence target = new ProteinSequence("ERDNKGFPS");
        query.setAccession(new AccessionID("query"));
        target.setAccession(new AccessionID("target"));
        SimpleGapPenalty gaps = new SimpleGapPenalty((short) 2, (short) 1);
        SubstitutionMatrix<AminoAcidCompound> blosum62 = SubstitutionMatrixHelper.getBlosum62();
        SmithWaterman<ProteinSequence, AminoAcidCompound> alignment = new SmithWaterman<>(query, target, gaps,
                blosum62);
        String s = customFormat(alignment);

        System.out.format("### custom:%n%s%n%n", s);
        System.out.format("### toString:%n%s%n%n", alignment.getPair());

        // Iterate the enum directly so formats print in declaration order;
        // collecting into a HashMap first made the order unspecified.
        for (Profile.StringFormat f : Profile.StringFormat.values()) {
            System.out.format("### %s:%n%s%n%n", f, alignment.getPair().toString(f));
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.*;
Expand All @@ -51,7 +53,7 @@
* @param <C> each element of an {@link AlignedSequence} is a {@link Compound} of type C
*/
public class SimpleProfile<S extends Sequence<C>, C extends Compound> implements Serializable, Profile<S, C> {

// Class-wide logger; final so the shared reference can never be reassigned.
private static final Logger logger = LoggerFactory.getLogger(SimpleProfile.class);

private static final long serialVersionUID = 1L;

Expand Down Expand Up @@ -376,25 +378,38 @@ public String toString(int width) {
}

@Override
public String toString(StringFormat format) {
public String toString(StringFormat format, int width) {
switch (format) {
default:
logger.warn("Unrecognized StringFormat {}", format);
// fall through
case ALN:
case CLUSTALW:
default:
return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(list) + " ",
return toString(width, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(list) + " ",
false, true, true, false, true, false);
case FASTA:
return toString(60, null, ">%s%n", false, false, false, false, false, false);
return toString(width, null, ">%s%n", false, false, false, false, false, false);
case GCG:
case MSF:
return toString(50, IOUtils.getGCGHeader(list), IOUtils.getIDFormat(list), false, false, true, false,
return toString(width, IOUtils.getGCGHeader(list), IOUtils.getIDFormat(list), false, false, true, false,
false, false);
case PDBWEB:
return toString(60, null, "%10s", true, true, true, false, true, true);
return toString(width, null, "%10s", true, true, true, false, true, true);
case CONCENSUS:
return toString(width, null, null, false, true, true, false, true, false);
}
}

// method from Object
/**
 * Formats the profile using the default line width for the given format:
 * 50 for {@code GCG} and {@code MSF}, 60 for everything else.
 *
 * @param format output format
 * @return the result of {@link #toString(StringFormat, int)} with that width
 */
@Override
public String toString(StringFormat format) {
    boolean gcgStyle = format == StringFormat.GCG || format == StringFormat.MSF;
    return toString(format, gcgStyle ? 50 : 60);
}

@Override
public String toString() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ enum StringFormat {
*<div class="subText"> <b>Legend:</b> <span style="color:green">Green</span> - identical residues | <span style="color:pink">Pink</span> - similar residues | <span style="color:blue">Blue</span> - sequence mismatch | <span style="color:brown">Brown</span> - insertion/deletion | </div>
*/
PDBWEB,
CONCENSUS,
}

/**
Expand Down Expand Up @@ -341,4 +342,13 @@ enum StringFormat {
*/
String toString(StringFormat format);

/**
 * Returns a formatted view of the alignment profile. Details depend on the format given.
 * <p>
 * Unlike {@link #toString(StringFormat)}, which leaves the line width to the
 * implementation, this overload lets the caller request one.
 * <p>
 * NOTE(review): some formats are conventionally fixed-width; whether an
 * implementation should honor or ignore {@code width} for those formats is
 * not specified here and should be confirmed.
 *
 * @param format output format
 * @param width limit on the line length
 * @return a formatted view of the alignment profile
 */
String toString(StringFormat format, int width);

}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.LightweightProfile;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
Expand All @@ -41,6 +43,7 @@
* @author Mark Chapman
*/
public class MultipleSequenceAlignment<S extends Sequence<C>, C extends Compound> implements LightweightProfile<S, C> {
// Class-wide logger; final so the shared reference can never be reassigned.
private static final Logger logger = LoggerFactory.getLogger(MultipleSequenceAlignment.class);

private List<S> sequences = new ArrayList<S>();
private Integer length = null;
Expand Down Expand Up @@ -149,22 +152,37 @@ public String toString(int width) {
* @return String in one of the supported file formats.
*/
@Override
public String toString(StringFormat format) {
public String toString(StringFormat format, int width) {
switch (format) {
default:
logger.warn("Unrecognized StringFormat {}", format);
// fall through
case ALN:
case CLUSTALW:
default:
return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(sequences) +
return toString(width, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(sequences) +
" ", true, false, true, false);
case FASTA:
return toString(60, null, ">%s%n", false, false, false, false);
return toString(width, null, ">%s%n", false, false, false, false);
case GCG:
case MSF:
return toString(50, IOUtils.getGCGHeader(sequences), IOUtils.getIDFormat(sequences), true, false, false,
return toString(width, IOUtils.getGCGHeader(sequences), IOUtils.getIDFormat(sequences), true, false, false,
false);
case PDBWEB:
return toString(60, null, "%s", true, false, true, true);
return toString(width, null, "%s", true, false, true, true);
case CONCENSUS:
return toString(width, null, null, true, false, true, false);
}
}

/**
 * Formats the alignment with the format's default line width, which is 50
 * for {@code GCG} and {@code MSF} and 60 for all other formats.
 *
 * @param format output format
 * @return the result of {@link #toString(StringFormat, int)} with that width
 */
@Override
public String toString(StringFormat format) {
    if (format == StringFormat.GCG || format == StringFormat.MSF) {
        return toString(format, 50);
    }
    return toString(format, 60);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ enum StringFormat {
FASTA,
GCG,
MSF,
PDBWEB
PDBWEB,
CONCENSUS,
}

/**
Expand Down Expand Up @@ -121,4 +122,12 @@ enum StringFormat {
*/
String toString(StringFormat format);

/**
 * Returns a formatted view of the alignment profile. Details depend on the format given.
 * <p>
 * Unlike {@link #toString(StringFormat)}, which leaves the line width to the
 * implementation, this overload lets the caller request one.
 * <p>
 * NOTE(review): some formats are conventionally fixed-width; whether an
 * implementation should honor or ignore {@code width} for those formats is
 * not specified here and should be confirmed.
 *
 * @param format output format
 * @param width limit on the line length
 * @return a formatted view of the alignment profile
 */
String toString(StringFormat format, int width);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ARND- 4
| |
-R-DG 3
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX
ARNDCEQGHILKMFPSTWYVBZJX

0 comments on commit a4006f0

Please sign in to comment.