Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More consistent MOLfile/SDfile property handling. #972

Closed
wants to merge 10 commits into from
21 changes: 11 additions & 10 deletions base/core/src/main/java/org/openscience/cdk/io/ChemObjectIO.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,16 @@ public void removeChemObjectIOListener(IChemObjectIOListener listener) {
*/
@Override
public <S extends IOSetting> S addSetting(IOSetting setting) {
    final String name = setting.getName();

    // No setting registered under this name yet: hand the new one to the store.
    if (!hasSetting(name)) {
        return (S) settings.add(setting);
    }

    try {
        // A setting of this name already exists: copy the incoming value onto
        // it and return the existing instance rather than the argument.
        final S existing = getSetting(name);
        existing.setSetting(setting.getSetting());
        return existing;
    } catch (CDKException ignored) {
        // The incoming value was invalid. This is deliberately ignored: a value
        // is already held for this setting, and CDKException cannot be thrown
        // from this method (the interface lives in the interfaces module).
    }

    // Reached only when the update above was rejected.
    return (S) settings.add(setting);
}

Expand All @@ -91,16 +101,7 @@ public <S extends IOSetting> S addSetting(IOSetting setting) {
@Override
public void addSettings(Collection<IOSetting> settings) {
for (IOSetting setting : settings) {
if (hasSetting(setting.getName())) {
try {
getSetting(setting.getName()).setSetting(setting.getSetting());
} catch (CDKException ex) {
// setting value was invalid (ignore as we already have a value for this setting
// and we can't throw CDKException as IChemObject is in interfaces module)
}
} else {
addSetting(setting);
}
addSetting(setting);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ public class MDLV2000Reader extends DefaultChemObjectReader {
private static final Pattern TRAILING_SPACE = Pattern.compile("\\s+$");

/** Delimits Structure-Data (SD) Files. */
private static final String RECORD_DELIMITER = "$$$$";
private static final String SDF_RECORD_DELIMITER = "$$$$";

/** Valid pseudo labels. */
private static final Set<String> PSEUDO_LABELS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("*","A","Q","L","LP","R","R#")));
Expand Down Expand Up @@ -327,9 +327,9 @@ private IAtomContainer readAtomContainer(IAtomContainer molecule) throws CDKExce
return null;
}

if (line.startsWith("$$$$")) {
if (line.startsWith(SDF_RECORD_DELIMITER))
return molecule;
}

if (line.length() > 0) {
title = line;
}
Expand Down Expand Up @@ -2329,7 +2329,7 @@ static String dataHeader(final String line) {
* @return the line indicates the end of a record was reached
*/
private static boolean endOfRecord(final String line) {
return line == null || line.equals(RECORD_DELIMITER);
return line == null || line.startsWith(SDF_RECORD_DELIMITER);
}

/**
Expand Down
134 changes: 127 additions & 7 deletions storage/ctab/src/main/java/org/openscience/cdk/io/MDLV2000Writer.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.openscience.cdk.sgroup.Sgroup;
import org.openscience.cdk.sgroup.SgroupBracket;
import org.openscience.cdk.sgroup.SgroupKey;
import org.openscience.cdk.smiles.InvPair;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
Expand All @@ -73,6 +74,7 @@
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
Expand Down Expand Up @@ -124,8 +126,11 @@ public class MDLV2000Writer extends DefaultChemObjectWriter {
public static final String OptWriteQueryFormatValencies = "WriteQueryFormatValencies";
public static final String OptWriteDefaultProperties = "WriteDefaultProperties";
public static final String OptProgramName = "ProgramName";
public static final String OptWriteData = "writeProperties";
public static final String OptTruncateLongData = "TruncateLongData";

private final static ILoggingTool logger = LoggingToolFactory.createLoggingTool(MDLV2000Writer.class);
private static final int MAX_SDTAG_LENGTH = 200;

// regular expression to capture R groups with attached numbers
private final Pattern NUMERED_R_GROUP = Pattern.compile("R(\\d+)");
Expand Down Expand Up @@ -214,12 +219,33 @@ public static SPIN_MULTIPLICITY ofValue(int value) throws CDKException {
@Deprecated
private BooleanIOSetting writeQueryFormatValencies;

private BooleanIOSetting writeDefaultProps;
private BooleanIOSetting writeTrailingZeros;

private BooleanIOSetting writeDataOpt;

private BooleanIOSetting truncateDataOpt;

private StringIOSetting programNameOpt;

private Set<String> acceptedSdTags = null;

private BufferedWriter writer;

/**
 * Property keys used by CDK algorithms that are considered noise and are
 * therefore not serialized as SD file data items.
 */
static final Set<String> SD_TAGS_TO_IGNORE = new HashSet<>();

static {
    final String[] skippedTags = {
        InvPair.CANONICAL_LABEL,
        InvPair.INVARIANCE_PAIR,
        CDKConstants.CTAB_SGROUPS,
        // TITLE/REMARK will be written in Molfile header
        CDKConstants.TITLE,
        CDKConstants.REMARK
    };
    for (String tag : skippedTags) {
        SD_TAGS_TO_IGNORE.add(tag);
    }
}

/**
* Constructs a new MDLWriter that can write an {@link IAtomContainer}
* to the MDL molfile format.
Expand Down Expand Up @@ -249,6 +275,10 @@ public MDLV2000Writer() {
this(new StringWriter());
}

/**
 * Sets the property keys that may be written as SD data items.
 * The set is stored by reference (not copied) and is later passed as the
 * {@code accept} filter to {@code writeNonStructuralData}, where a
 * {@code null} filter places no restriction on the keys.
 *
 * @param acceptedSdTags keys to allow, or {@code null} for no restriction
 */
void setAcceptedSdTags(Set<String> acceptedSdTags) {
this.acceptedSdTags = acceptedSdTags;
}

@Override
public IResourceFormat getFormat() {
return MDLFormat.getInstance();
Expand Down Expand Up @@ -529,7 +559,7 @@ public void writeMolecule(IAtomContainer container) throws Exception {
line.append(formatMDLInt(atomprops[0], 2)); // dd (mass-number)
line.append(formatMDLInt(atomprops[1], 3)); // ccc (charge)
int last = atomprops.length-1;
if (!writeDefaultProps.isSet())
if (!writeTrailingZeros.isSet())
{
while (last >= 0) {
if (atomprops[last] != 0)
Expand Down Expand Up @@ -656,7 +686,7 @@ else if (e.equals(new Expr(Expr.Type.ALIPHATIC_ORDER, 2).or(new Expr(Expr.Type.I
default:
line.append("0");
}
if (writeDefaultProps.isSet())
if (writeTrailingZeros.isSet())
line.append(" 0 0 0");
line.append('\n');
writer.write(line.toString());
Expand Down Expand Up @@ -800,6 +830,16 @@ else if (e.equals(new Expr(Expr.Type.ALIPHATIC_ORDER, 2).or(new Expr(Expr.Type.I
// close molecule
writer.write("M END");
writer.write('\n');

// write non-structural data (mol properties in our case)
if (writeDataOpt.isSet()) {
MDLV2000Writer.writeNonStructuralData(writer,
container,
SD_TAGS_TO_IGNORE,
acceptedSdTags,
truncateDataOpt.isSet());
}

writer.flush();
}

Expand Down Expand Up @@ -1389,6 +1429,80 @@ protected static String formatMDLString(String s, int le) {
return s;
}

/**
 * Produces a data-header-safe version of a property key by substituting an
 * underscore for each of the characters {@code - < > . = %} and space.
 *
 * @param headerKey the raw property key (must not be null)
 * @return the key with every listed character replaced by {@code '_'}
 */
private static String replaceInvalidHeaderChars(String headerKey) {
    final int length = headerKey.length();
    final StringBuilder cleaned = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        final char c = headerKey.charAt(i);
        switch (c) {
            case '-':
            case '<':
            case '>':
            case '.':
            case '=':
            case '%':
            case ' ':
                cleaned.append('_');
                break;
            default:
                cleaned.append(c);
                break;
        }
    }
    return cleaned.toString();
}

/**
 * Tests whether a property value can be written as SD data, i.e. it is
 * {@code null}, a {@link String}, or a boxed Java primitive.
 *
 * <p>All eight primitive wrappers are accepted, including {@link Long},
 * which was previously missing and caused {@code Long} properties to be
 * skipped. Every accepted class is final, so {@code instanceof} is
 * equivalent to an exact class comparison.
 *
 * @param obj the property value, may be {@code null}
 * @return {@code true} if the value is null, a string or a boxed primitive
 */
private static boolean isPrimitiveDataValue(Object obj) {
    return obj == null
            || obj instanceof String
            || obj instanceof Boolean
            || obj instanceof Character
            || obj instanceof Byte
            || obj instanceof Short
            || obj instanceof Integer
            || obj instanceof Long
            || obj instanceof Float
            || obj instanceof Double;
}

/**
 * Write non-structural SDfile key/value pairs (one data item per molecule
 * property).
 *
 * <p>Each written item consists of a header line, the value (if any) and a
 * terminating blank line. Properties are skipped when their key is in
 * {@code reject}, when {@code accept} is non-null and does not contain the
 * key, or when the value is not a string/boxed primitive. Characters that
 * are not valid in a data header are replaced in the key before writing.
 *
 * <p>A {@code null} value is written as an empty data item, so the header is
 * always followed by its blank-line terminator.
 *
 * @param wtr      destination writer
 * @param mol      molecule whose properties are written
 * @param reject   keys that must not be written, or {@code null} for none
 * @param accept   the only keys that may be written, or {@code null} for no restriction
 * @param truncate if set, each line of a value longer than
 *                 {@code MAX_SDTAG_LENGTH} characters is cut to that length
 * @throws IOException if the underlying writer fails
 */
static void writeNonStructuralData(BufferedWriter wtr,
                                   final IAtomContainer mol,
                                   final Set<String> reject,
                                   final Set<String> accept,
                                   final boolean truncate) throws IOException {
    final Map<Object, Object> sdFields = mol.getProperties();
    if (sdFields == null)
        return;

    for (Map.Entry<Object, Object> e : sdFields.entrySet()) {

        final String key = e.getKey().toString();
        if (reject != null && reject.contains(key))
            continue;
        if (accept != null && !accept.contains(key))
            continue;

        final String cleanHeaderKey = replaceInvalidHeaderChars(key);
        if (!cleanHeaderKey.equals(key))
            logger.info("Replaced characters in SDfile data header: ",
                        key, " written as: ", cleanHeaderKey);

        final Object val = e.getValue();
        if (!isPrimitiveDataValue(val)) {
            logger.info("Skipped property " + key,
                        " because only primitive and string properties",
                        " can be written by SDFWriter");
            continue;
        }

        wtr.write("> <");
        wtr.write(cleanHeaderKey);
        wtr.write(">\n");

        if (val != null) {
            String valStr = val.toString();
            // no single line can exceed the limit unless the whole value does
            if (truncate && valStr.length() > MAX_SDTAG_LENGTH) {
                final StringBuilder sb = new StringBuilder();
                String separator = "";
                for (String line : valStr.split("\n")) {
                    sb.append(separator);
                    sb.append(line, 0, Math.min(line.length(), MAX_SDTAG_LENGTH));
                    // join (rather than terminate) lines so the truncated value
                    // does not gain a trailing newline / extra blank line
                    separator = "\n";
                }
                valStr = sb.toString();
            }
            wtr.write(valStr);
            wtr.write("\n");
        }

        // blank line terminates the data item; also required for null values,
        // otherwise the next header would run into this item
        wtr.write("\n");
    }

}

/**
* Initializes IO settings.<br>
* Please note with regards to "writeAromaticBondTypes": bond type values 4 through 8 are for SSS queries only,
Expand All @@ -1403,14 +1517,20 @@ private void initIOSettings() {
"Should aromatic bonds be written as bond type 4?", "false"));
writeQueryFormatValencies = addSetting(new BooleanIOSetting(OptWriteQueryFormatValencies,
IOSetting.Importance.LOW, "Should valencies be written in the MDL Query format? (deprecated)", "false"));
writeDefaultProps = addSetting(new BooleanIOSetting(OptWriteDefaultProperties,
IOSetting.Importance.LOW,
"Write trailing zero's on atom/bond property blocks even if they're not used.",
"true"));
writeTrailingZeros = addSetting(new BooleanIOSetting(OptWriteDefaultProperties,
IOSetting.Importance.LOW,
"Write trailing zero's on atom/bond property blocks even if they're not used.",
"true"));
programNameOpt = addSetting(new StringIOSetting(OptProgramName,
IOSetting.Importance.LOW,
"Program name to write at the top of the molfile header, should be exactly 8 characters long",
"CDK"));
writeDataOpt = addSetting(new BooleanIOSetting(OptWriteData,
IOSetting.Importance.LOW,
"Should molecule properties be written as non-structural data", "false"));
truncateDataOpt = addSetting(new BooleanIOSetting(OptTruncateLongData,
IOSetting.Importance.LOW,
"Truncate long data files >200 characters", "false"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package org.openscience.cdk.io;

import org.openscience.cdk.CDK;
import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.config.Elements;
import org.openscience.cdk.exception.CDKException;
Expand Down Expand Up @@ -76,6 +77,7 @@
*/
public class MDLV3000Reader extends DefaultChemObjectReader {

public static final String M_END = "M END";
private BooleanIOSetting optForce3d;
private BooleanIOSetting optHydIso;
private BooleanIOSetting optStereoPerc;
Expand Down Expand Up @@ -234,6 +236,15 @@ public IAtomContainer readConnectionTable(IChemObjectBuilder builder) throws CDK
lastLine = readLine();
}

// read in any SDF fields
if (lastLine != null && lastLine.startsWith(M_END)) {
try {
MDLV2000Reader.readNonStructuralData(input, state.mol);
} catch (IOException ex) {
throw new CDKException("IO Error", ex);
}
}

finalizeMol(state);

return readData;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.openscience.cdk.interfaces.ITetrahedralChirality.Stereo;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.MDLV3000Format;
import org.openscience.cdk.io.setting.BooleanIOSetting;
import org.openscience.cdk.io.setting.IOSetting;
import org.openscience.cdk.io.setting.StringIOSetting;
import org.openscience.cdk.sgroup.Sgroup;
Expand Down Expand Up @@ -70,6 +71,8 @@

import static org.openscience.cdk.CDKConstants.ATOM_ATOM_MAPPING;
import static org.openscience.cdk.io.MDLV2000Writer.OptProgramName;
import static org.openscience.cdk.io.MDLV2000Writer.OptTruncateLongData;
import static org.openscience.cdk.io.MDLV2000Writer.OptWriteData;

/**
* Ctab V3000 format output. This writer provides output to the more modern (but less widely
Expand All @@ -92,6 +95,12 @@ public final class MDLV3000Writer extends DefaultChemObjectWriter {
private V30LineWriter writer;
private StringIOSetting programNameOpt;

private BooleanIOSetting writeDataOpt;

private BooleanIOSetting truncateDataOpt;

private Set<String> acceptedSdTags;

/**
* Create a new V3000 writer, output to the provided JDK writer.
*
Expand Down Expand Up @@ -119,6 +128,10 @@ public MDLV3000Writer() {
initIOSettings();
}

/**
 * Sets the property keys that may be written as SD data items after the
 * connection table. The set is stored by reference (not copied) and is
 * passed as the {@code accept} filter to
 * {@code MDLV2000Writer.writeNonStructuralData}, where a {@code null}
 * filter places no restriction on the keys.
 *
 * @param acceptedSdTags keys to allow, or {@code null} for no restriction
 */
void setAcceptedSdTags(Set<String> acceptedSdTags) {
this.acceptedSdTags = acceptedSdTags;
}

/**
* Safely access nullable int fields by defaulting to zero.
*
Expand Down Expand Up @@ -712,6 +725,14 @@ private void writeMol(IAtomContainer mol) throws IOException, CDKException {

writer.write("END CTAB\n");
writer.writeDirect("M END\n");
// write non-structural data (mol properties in our case)
if (writeDataOpt.isSet()) {
MDLV2000Writer.writeNonStructuralData(writer.writer,
mol,
MDLV2000Writer.SD_TAGS_TO_IGNORE,
acceptedSdTags,
truncateDataOpt.isSet());
}
writer.writer.flush();
}

Expand Down Expand Up @@ -829,14 +850,14 @@ private static final class V30LineWriter implements Closeable {
public static final int LIMIT = 78; // -\n takes two chars (80 total)

// the base writer instance
private final Writer writer;
private final BufferedWriter writer;

// tracks the current line length
private int currLength = 0;

public V30LineWriter(Writer writer) {
if (writer instanceof BufferedWriter) {
this.writer = writer;
this.writer = (BufferedWriter)writer;
} else {
this.writer = new BufferedWriter(writer);
}
Expand Down Expand Up @@ -996,6 +1017,12 @@ private void initIOSettings() {
IOSetting.Importance.LOW,
"Program name to write at the top of the molfile header, should be exactly 8 characters long",
"CDK"));
writeDataOpt = addSetting(new BooleanIOSetting(OptWriteData,
IOSetting.Importance.LOW,
"Should molecule properties be written as non-structural data", "false"));
truncateDataOpt = addSetting(new BooleanIOSetting(OptTruncateLongData,
IOSetting.Importance.LOW,
"Truncate long data files >200 characters", "false"));
}

public void customizeJob() {
Expand Down