Skip to content

Commit

Permalink
Merge pull request #25 from weng-lab/twobit
Browse files Browse the repository at this point in the history
Retrieve two bit data in one hot encoded format
  • Loading branch information
NishiPhalke committed Mar 3, 2021
2 parents 38b38b8 + 6678fec commit 5695687
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 9 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "genomic-reader",
"version": "1.4.7",
"version": "1.4.9",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"license": "MIT",
Expand Down
17 changes: 14 additions & 3 deletions src/bigwig/BigWigReader.ts
@@ -1,7 +1,7 @@
import { DataLoader, BufferedDataLoader, DataMissingError, FileFormatError } from "../loader/DataLoader";
import { BinaryParser } from "../util/BinaryParser";
import { loadHeaderData, HeaderData, FileType } from "./BigWigHeaderReader";
import { loadSequenceRecord, loadSequence, SequenceRecord, streamSequence } from "./TwoBitHeaderReader";
import { loadSequenceRecord, loadSequence, SequenceRecord, streamSequence, loadOneHotEncodingFromSequence } from "./TwoBitHeaderReader";
import { inflate } from "pako";
import { Stream, Readable, Writable, Duplex } from "stream";
import { start } from "repl";
Expand Down Expand Up @@ -181,16 +181,27 @@ export class BigWigReader {
return loadSequence(this.dataLoader, this.cachedHeader!, sequence, startBase, endBase);
}

/**
 * Reads a region of a TwoBit file as a one-hot encoded matrix.
 *
 * The sequence record for the chromosome is resolved first, then the work is
 * delegated to `loadOneHotEncodingFromSequence`, which yields one
 * four-element row per base (columns A, C, G, T; `N` is all zeros).
 * The range is treated as inclusive at both ends: requesting bases 2 through 5
 * produces four rows.
 *
 * @param chrom the chromosome from which to read.
 * @param startBase the starting base.
 * @param endBase the ending base.
 * @returns the one-hot rows for the requested bases, in sequence order.
 */
async readTwoBitDataMatrix(chrom: string, startBase: number, endBase: number): Promise<Array<Array<number>>> {
    const record = await this.getSequenceRecord(chrom);
    const header = this.cachedHeader!;
    return loadOneHotEncodingFromSequence(this.dataLoader, header, record, startBase, endBase);
}
/**
* Method for streaming Two Bit sequence data from TwoBit files in chunks.
*
* Resolves the sequence record for the chromosome and delegates to streamSequence.
*
* @param chrom the chromosome from which to read.
* @param startBase the starting base.
* @param endBase the ending base.
* @param chunkSize the number of bases requested per chunk; defaults to 1024.
* @param oneHotEncodedData when true, the returned stream is in object mode and each
* chunk is a one-hot encoded matrix (one four-element row per base); when false
* (the default), each chunk is a string of bases on a utf8 stream.
* @returns a Readable stream carrying the requested region chunk by chunk.
*/
async streamTwoBitData(chrom: string, startBase: number, endBase: number, chunkSize: number = 1024): Promise<Readable> {
async streamTwoBitData(chrom: string, startBase: number, endBase: number, chunkSize: number = 1024, oneHotEncodedData= false): Promise<Readable> {
const sequence: SequenceRecord = await this.getSequenceRecord(chrom);
return streamSequence(this.dataLoader, this.cachedHeader!, sequence, startBase, endBase, chunkSize);
return streamSequence(this.dataLoader, this.cachedHeader!, sequence, startBase, endBase, chunkSize, oneHotEncodedData);
}

/**
Expand Down
45 changes: 40 additions & 5 deletions src/bigwig/TwoBitHeaderReader.ts
Expand Up @@ -15,6 +15,18 @@ function chararray(): (i: number) => string {
return (i: number): string => CHARARRAY[i];
};

/**
 * One-hot rows for every base character, keyed by the character itself.
 * Column order is A, C, G, T; `N`/`n` map to an all-zero row. Upper- and
 * lower-case bases share the same encoding, but every key owns its own array.
 */
const letters: Record<string, number[]> = {};
// Upper-case alphabet first, then lower-case, so key order is A,C,G,T,N,a,c,g,t,n.
["ACGTN", "acgtn"].forEach((alphabet) => {
    alphabet.split("").forEach((base, position) => {
        // Position 4 (N / n) matches no channel and therefore stays all zeros.
        letters[base] = [0, 1, 2, 3].map((channel) => (channel === position ? 1 : 0));
    });
});
/**
* Decodes a byte to a sequence of bases.
*
Expand Down Expand Up @@ -160,22 +172,45 @@ function rn(i: number): string {
}

/**
* Streams a region of a two-bit sequence in chunks of up to chunkSize bases.
*
* Chunks are read one after another through a BufferedDataLoader wrapped around
* dataLoader. With oneHotEncodedData false (the default) each chunk is the string
* returned by loadSequence, pushed onto a utf8 Readable; with oneHotEncodedData true
* each chunk is the matrix returned by loadOneHotEncodingFromSequence, pushed onto an
* object-mode Readable. Every chunk is loaded and pushed before this function
* returns, so the returned stream is already fully populated and ended.
*
* NOTE(review): loadSequence appears to treat `end` as inclusive (the tests read
* 2..10 and get 9 bases), but the loop below runs while `currentStart < end`; when
* exactly one base remains (currentStart === end) it looks like that base is never
* streamed, and start === end yields an empty stream. Confirm intended semantics.
*
* NOTE(review): both Readables are constructed on every call although only one of
* them is returned.
*
* @param dataLoader the underlying loader to wrap in a BufferedDataLoader.
* @param header header data passed through to the sequence loaders.
* @param sequence the sequence record to read from.
* @param start the starting base.
* @param end the ending base.
* @param chunkSize the number of bases requested per chunk; defaults to 1024.
* @param oneHotEncodedData selects matrix chunks (true) or string chunks (false).
* @returns the Readable holding the chunks, already terminated with push(null).
*/
export async function streamSequence(dataLoader: DataLoader, header: HeaderData,
sequence: SequenceRecord, start: number, end: number, chunkSize: number = 1024): Promise<Readable> {
sequence: SequenceRecord, start: number, end: number, chunkSize: number = 1024, oneHotEncodedData= false): Promise<Readable> {
// Buffer size in bytes; presumably sized for four bases per byte plus one spare byte — TODO confirm.
const dataToBuffer = Math.ceil((end - start) / 4) + 1;
const bufferedLoader = new BufferedDataLoader(dataLoader, dataToBuffer, true);
// read() is a no-op because data is pushed manually in the loop below.
const stream = new Readable({ read() {}, encoding: 'utf8' });
const matrixStream = new Readable({ read() {}, objectMode: true });
let currentStart = start;
while (currentStart < end) {
// Each chunk covers [currentStart, currentEnd], clamped to the requested end.
let currentEnd = currentStart + chunkSize - 1;
if (currentEnd >= end) currentEnd = end;
const seq = await loadSequence(bufferedLoader, header, sequence, currentStart, currentEnd);
stream.push(seq);
if(oneHotEncodedData) {
const matrix = await loadOneHotEncodingFromSequence(bufferedLoader, header, sequence, currentStart, currentEnd);
matrixStream.push(matrix)
} else {
const seq = await loadSequence(bufferedLoader, header, sequence, currentStart, currentEnd);
stream.push(seq);
}
currentStart = currentEnd + 1;
}
// This is the dumb way Readable streams are signalled to end.
stream.push(null);
return stream;

// Only the stream that actually received chunks is ended and returned.
if(oneHotEncodedData)
{
matrixStream.push(null);
return matrixStream;

} else {
stream.push(null);
return stream;
}
}
/**
 * Loads a stretch of a two-bit sequence and converts it to a one-hot encoded matrix.
 *
 * The bases are fetched with `loadSequence`; every character of the result is then
 * looked up in the `letters` table, giving one four-element row per base.
 *
 * @param dataLoader the loader (plain or buffered) handed to `loadSequence`.
 * @param header header data passed through to `loadSequence`.
 * @param sequence the sequence record to read from.
 * @param start the starting base.
 * @param end the ending base.
 * @returns one row per character returned by `loadSequence`, in order. Every row is
 *   a fresh array, so callers may mutate the result without corrupting the shared
 *   lookup table or sibling rows. A character with no entry in `letters` yields an
 *   all-zero row (the same encoding as `N`) instead of `undefined`.
 */
export async function loadOneHotEncodingFromSequence(dataLoader: DataLoader|BufferedDataLoader, header: HeaderData,
    sequence: SequenceRecord, start: number, end: number):Promise<Array<Array<number>>> {
    const seq = await loadSequence(dataLoader, header, sequence, start, end);
    const matrix: number[][] = [];
    for (const base of seq) {
        const row: number[] | undefined = letters[base];
        // Copy the row: pushing the table's own array would alias every occurrence
        // of the same base (and the table itself) to a single mutable object.
        matrix.push(row === undefined ? [0, 0, 0, 0] : row.slice());
    }
    return matrix;
}

/**
* Loads sequence data from a two-bit file.
Expand Down
14 changes: 14 additions & 0 deletions test/TwoBitReader.test.ts
Expand Up @@ -38,6 +38,7 @@ describe("TwoBitReader", () => {
const loader = new AxiosDataLoader(`http://localhost:8001/${testTwoBitFilename}`, Axios.create());
const reader = new BigWigReader(loader);
expect(await reader.readTwoBitData("seq1", 2, 10)).toEqual("CTGATGCTA");
expect(await reader.readTwoBitDataMatrix("seq1", 2, 5)).toEqual([[0,1,0,0],[0,0,0,1],[0,0,1,0],[1,0,0,0]]);
expect(await reader.readTwoBitData("seq1", 45, 48)).toEqual("NNNN");
expect(await reader.readTwoBitData("seq1", 44, 47)).toEqual("ANNN");
expect(await reader.readTwoBitData("seq1", 44, 87)).toEqual("ANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTA");
Expand All @@ -51,6 +52,7 @@ describe("TwoBitReader", () => {
const reader = new BigWigReader(loader);
expect(await reader.readTwoBitData("seq2", 1, 11)).toEqual("actgtgatcga");
expect(await reader.readTwoBitData("seq2", 21, 22)).toEqual("tG");
expect(await reader.readTwoBitDataMatrix("seq2", 21, 22)).toEqual([[0,0,0,1],[0,0,1,0]]);
expect(await reader.readTwoBitData("seq2", 77, 78)).toEqual("Gg");
expect(await reader.readTwoBitData("seq2", 106, 116)).toEqual("gtagccggcga");
});
Expand All @@ -66,4 +68,16 @@ describe("TwoBitReader", () => {
expect(chunkSizes).toEqual([32, 32, 32, 27]);
});

// Streams bases 2..4 of seq1 with one-hot encoding enabled and the default chunk
// size, so the whole region is expected to arrive as a single three-row chunk.
it("should stream one hot encoded data from seq1", async () => {
    const reader = new BigWigReader(
        new AxiosDataLoader(`http://localhost:8001/${testTwoBitFilename}`, Axios.create())
    );
    const stream = await reader.streamTwoBitData("seq1", 2, 4, undefined, true);
    // NOTE(review): the chunks are matrices here, not strings; the annotation
    // presumably mirrors streamToArray's declared return type — confirm.
    const chunks: string[] = await streamToArray(stream);

    expect(chunks[0]).toStrictEqual([[0,1,0,0],[0,0,0,1],[0,0,1,0]]);

    expect(chunks.map((chunk) => chunk.length)).toEqual([3]);
});

});

0 comments on commit 5695687

Please sign in to comment.