Skip to content

Commit

Permalink
Accommodate '@' as a valid character in the quality strings.
Browse files Browse the repository at this point in the history
This requires us to use a length-based approach to finding the end
of a quality string, given that '@' is not a reliable terminator.
  • Loading branch information
LTLA committed Oct 20, 2023
1 parent 26a4b4f commit 5d3214a
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 29 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.24)

project(kaori
VERSION 1.1.0
VERSION 1.1.1
DESCRIPTION "Header-only C++ library for screen counting"
LANGUAGES CXX)

Expand Down
34 changes: 18 additions & 16 deletions include/kaori/FastqReader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,7 @@ class FastqReader {
FastqReader(byteme::Reader* p) : pb(p) {
sequence.reserve(200);
name.reserve(200);

okay = pb.valid();
if (okay) {
if (pb.get() != '@') {
throw std::runtime_error("first line containing FASTQ name should start with '@'");
}
}
}

/**
Expand All @@ -55,7 +49,12 @@ class FastqReader {

// Processing the name. This should be on a single line, hopefully.
name.clear();
char val = advance_and_check();
char val = pb.get();
if (val != '@') {
throw std::runtime_error("read name should start with '@' (starting line " + std::to_string(init_line + 1) + ")");
}

val = advance_and_check();
while (!std::isspace(val)) {
name.push_back(val);
val = advance_and_check();
Expand Down Expand Up @@ -84,26 +83,29 @@ class FastqReader {
}
++line_count;

// Processing the qualities. Extraction is allowed to fail if we're at the
// end of the file.
size_t qual_length = 0;
// Processing the qualities. Extraction is allowed to fail if we're at
// the end of the file. Note that we can't check for '@' as a
// delimiter, as this can be a valid score, so instead we check at each
// newline whether we've reached the specified length, and quit if so.
size_t qual_length = 0, seq_length = sequence.size();
okay = false;

while (pb.advance()) {
val = pb.get();
if (val == '@') {
okay = true;
break;
}
if (val != '\n') {
++qual_length;
} else if (qual_length >= seq_length) {
okay = pb.advance(); // sneak past the newline.
break;
}
}
++line_count;

if (qual_length != sequence.size()) {
if (qual_length != seq_length) {
throw std::runtime_error("non-equal lengths for quality and sequence strings (starting line " + std::to_string(init_line + 1) + ")");
}

++line_count;

return true;
}

Expand Down
73 changes: 61 additions & 12 deletions tests/src/FastqReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,29 +51,42 @@ TEST(BasicTests, MultipleEntries) {
}

TEST(BasicTests, MultiLineEntries) {
std::string buffer = "@FOO\nA\nCG\nTGCA\n+\n!!\n!!\n!!!";
byteme::RawBufferReader reader(reinterpret_cast<const unsigned char*>(buffer.c_str()), buffer.size());
kaori::FastqReader fq(&reader);
for (size_t i = 0; i < 2; ++i) {
std::string buffer = "@FOO\nA\nCG\nTGCA\n+\n!!\n!!\n!!!\n@ARG\nACACGGT\nC\n+\n@@@\n@\n@@@@";
if (i == 2) {
buffer += '\n';
}

EXPECT_TRUE(fq());
const auto& name = fq.get_name();
EXPECT_EQ(std::string(name.begin(), name.end()), "FOO");
const auto& seq = fq.get_sequence();
EXPECT_EQ(std::string(seq.begin(), seq.end()), "ACGTGCA");
byteme::RawBufferReader reader(reinterpret_cast<const unsigned char*>(buffer.c_str()), buffer.size());
kaori::FastqReader fq(&reader);

EXPECT_FALSE(fq());
EXPECT_TRUE(fq());
const auto& name = fq.get_name();
EXPECT_EQ(std::string(name.begin(), name.end()), "FOO");
const auto& seq = fq.get_sequence();
EXPECT_EQ(std::string(seq.begin(), seq.end()), "ACGTGCA");

EXPECT_TRUE(fq());
const auto& name2 = fq.get_name();
EXPECT_EQ(std::string(name2.begin(), name2.end()), "ARG");
const auto& seq2 = fq.get_sequence();
EXPECT_EQ(std::string(seq2.begin(), seq2.end()), "ACACGGTC");

EXPECT_FALSE(fq());
}
}

TEST(BasicTests, Errors) {
{
std::string buffer = "FOO";
byteme::RawBufferReader reader(reinterpret_cast<const unsigned char*>(buffer.c_str()), buffer.size());
kaori::FastqReader fq(&reader);

EXPECT_ANY_THROW({
try {
kaori::FastqReader fq(&reader);
fq();
} catch (std::exception& e) {
EXPECT_TRUE(std::string(e.what()).find("first line") != std::string::npos);
EXPECT_TRUE(std::string(e.what()).find("read name should start") != std::string::npos);
throw e;
}
});
Expand All @@ -95,7 +108,25 @@ TEST(BasicTests, Errors) {
}

{
std::string buffer = "@FOO\nAC\n+\n!!\n@WHEE\nACGT\n+\n!!";
std::string buffer = "@FOO\nAC\n+\n!!\n@WHEE\nACGT\n+\n!!"; // too short.
byteme::RawBufferReader reader(reinterpret_cast<const unsigned char*>(buffer.c_str()), buffer.size());
kaori::FastqReader fq(&reader);
fq();

EXPECT_ANY_THROW({
try {
fq();
} catch (std::exception& e) {
std::string msg(e.what());
EXPECT_TRUE(msg.find("non-equal lengths") != std::string::npos);
EXPECT_TRUE(msg.find("line 5") != std::string::npos);
throw e;
}
});
}

{
std::string buffer = "@FOO\nAC\n+\n!!\n@WHEE\nACGT\n+\n!!!@@!!@\n"; // too long.
byteme::RawBufferReader reader(reinterpret_cast<const unsigned char*>(buffer.c_str()), buffer.size());
kaori::FastqReader fq(&reader);
fq();
Expand All @@ -111,6 +142,24 @@ TEST(BasicTests, Errors) {
}
});
}

{
std::string buffer = "@FOO\nAC\n+\n!!\nWHEE";
byteme::RawBufferReader reader(reinterpret_cast<const unsigned char*>(buffer.c_str()), buffer.size());
kaori::FastqReader fq(&reader);
fq();

EXPECT_ANY_THROW({
try {
fq();
} catch (std::exception& e) {
std::string msg(e.what());
EXPECT_TRUE(msg.find("should start") != std::string::npos);
EXPECT_TRUE(msg.find("line 5") != std::string::npos);
throw e;
}
});
}
}

class FastqReaderFileTest : public testing::TestWithParam<int> {};
Expand Down

0 comments on commit 5d3214a

Please sign in to comment.