diff --git a/DB.c b/DB.c index f0f654c..27b202e 100644 --- a/DB.c +++ b/DB.c @@ -73,14 +73,20 @@ char *Prog_Name; +#ifdef INTERACTIVE + +char Ebuffer[1000]; + +#endif + void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) - fprintf(stderr,"%s: Out of memory\n",Prog_Name); + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else - fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg); + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } @@ -88,9 +94,9 @@ void *Malloc(int64 size, char *mesg) void *Realloc(void *p, int64 size, char *mesg) { if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) - fprintf(stderr,"%s: Out of memory\n",Prog_Name); + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else - fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg); + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } @@ -102,9 +108,9 @@ char *Strdup(char *name, char *mesg) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) - fprintf(stderr,"%s: Out of memory\n",Prog_Name); + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else - fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg); + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } @@ -115,7 +121,7 @@ FILE *Fopen(char *name, char *mode) if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) - fprintf(stderr,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); + EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } @@ -181,7 +187,7 @@ char *Catenate(char *path, char *sep, char *root, char *suffix) if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) - { fprintf(stderr,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); + { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } @@ -201,7 +207,7 @@ char *Numbered_Suffix(char *left, int num, char *right) if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) - { fprintf(stderr,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); + { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } @@ -383,19 +389,21 @@ void Number_Read(char *s) // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: -// -1: The DB could not be opened for a reason reported by the routine to stderr +// -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) -{ char *root, *pwd, *bptr, *fptr, *cat; - int nreads; - FILE *index, *dbvis; - int status, plen, isdam; - int part, cutoff, all; - int ufirst, tfirst, ulast, tlast; +{ HITS_DB dbcopy; + char *root, *pwd, *bptr, *fptr, *cat; + int nreads; + FILE *index, *dbvis; + int status, plen, isdam; + int part, cutoff, all; + int ufirst, tfirst, ulast, tlast; status = -1; + dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) @@ -418,22 +426,24 @@ int Open_DB(char* path, HITS_DB *db) isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) - exit (1); + return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) - exit (1); + return (-1); if ((dbvis = fopen(cat,"r")) == NULL) - { fprintf(stderr,"%s: Could not open database %s\n",Prog_Name,path); - goto exit; + { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); + goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) - goto exit1; + goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + goto error2; + } { int p, nblocks, nfiles; int64 size; @@ -441,35 +451,45 @@ int Open_DB(char* path, HITS_DB *db) nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else - { fprintf(stderr,"%s: DB %s has not yet been partitioned, cannot request a block !\n", + { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); - goto exit2; + goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } if (part > nblocks) - { fprintf(stderr,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); - goto exit2; + { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); + goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } } else { ufirst = tfirst = 0; @@ -491,7 +511,10 @@ int Open_DB(char* path, HITS_DB *db) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(db->reads); + goto error2; + } } else { HITS_READ *reads; @@ -503,7 +526,10 @@ int Open_DB(char* path, HITS_DB *db) fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(reads); + goto error2; + } totlen = 0; maxlen = 0; @@ -524,22 +550,27 @@ int Open_DB(char* path, HITS_DB *db) db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); + if (db->path == NULL) + goto error2; db->bases = NULL; db->loaded = 0; status = isdam; -exit2: +error2: fclose(index); -exit1: +error1: fclose(dbvis); -exit: +error: if (bptr != NULL) *bptr = '.'; free(pwd); free(root); + if (status < 0) + *db = dbcopy; + return (status); } @@ -682,61 +713,58 @@ void Close_DB(HITS_DB *db) HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing -void Load_QVs(HITS_DB *db) +int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; + char *root; uint16 *table; - QVcoding *coding; HITS_QV *qvtrk; + QVcoding *coding, *nx; + int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) - return; + return (0); if (db->trimmed) - { fprintf(stderr,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); + EXIT(1); } if (db->reads[db->nreads-1].coff < 0) - { fprintf(stderr,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); + EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) - return; - if (db->part > 0) - { indx = Fopen(Catenate(db->path,"","",".idx"),"r"); - if (indx == NULL) - exit (1); - } - else - indx = NULL; + return (-1); - { char *x; + istub = NULL; + indx = NULL; + table = NULL; + coding = NULL; + qvtrk = NULL; - x = rindex(db->path,'/'); - *x = '\0'; - istub = Fopen(Catenate(db->path,"/",x+2,".db"),"r"); - if (istub == NULL) - exit (1); - *x = '/'; - } + root = rindex(db->path,'/') + 2; + istub = Fopen(Catenate(db->path,"/",root,".db"),"r"); + if (istub == NULL) + goto error; - { int first, last, nfiles, ncodes; + { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; - table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); - if (fscanf(istub,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } if (db->part > 0) - { int pfirst, plast; - int fbeg, fend; - int n, k; + { int pfirst, plast; + int fbeg, fend; + int n, k; + FILE *indx; // Determine first how many and which files span the block (fbeg to fend) @@ -746,7 +774,9 @@ void Load_QVs(HITS_DB *db) first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } if (last > pfirst) break; first = last; @@ -755,31 +785,36 @@ void Load_QVs(HITS_DB *db) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } first = last; } + indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); + table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); + if (indx == NULL || coding == NULL || table == NULL) + { ncodes = 0; + goto error; + } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". rewind(istub); - if (fscanf(istub,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) - { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) - { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) @@ -787,13 +822,26 @@ void Load_QVs(HITS_DB *db) fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + ncodes = i; + goto error; + } fseeko(quiva,read.coff,SEEK_SET); - coding[i] = *Read_QVcoding(quiva); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); - coding[i] = *Read_QVcoding(quiva); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } @@ -807,9 +855,10 @@ void Load_QVs(HITS_DB *db) table[j++] = (uint16) i; first = last; - } + } fclose(indx); + indx = NULL; } else @@ -818,14 +867,24 @@ void Load_QVs(HITS_DB *db) ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); + table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); + if (coding == NULL || table == NULL) + goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } fseeko(quiva,db->reads[first].coff,SEEK_SET); - coding[i] = *Read_QVcoding(quiva); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) @@ -839,9 +898,13 @@ void Load_QVs(HITS_DB *db) // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); + if (qvtrk == NULL) + goto error; + qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); + if (qvtrk->name == NULL) + goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; - qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; @@ -849,7 +912,25 @@ void Load_QVs(HITS_DB *db) } fclose(istub); - fclose(indx); + return (0); + +error: + if (qvtrk != NULL) + free(qvtrk); + if (table != NULL) + free(table); + if (coding != NULL) + { int i; + for (i = 0; i < ncodes; i++) + Free_QVcoding(coding+i); + free(coding); + } + if (indx != NULL) + fclose(indx); + if (istub != NULL) + fclose(istub); + fclose(quiva); + EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory @@ -939,11 +1020,12 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) int treads, ureads; void *anno; void *data; + char *name; HITS_TRACK *record; if (track[0] == '.') - { fprintf(stderr,"Track names cannot begin with a .\n"); - exit (1); + { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); + EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) @@ -960,17 +1042,35 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) ispart = 0; } if (afile == NULL) - return (NULL); + { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); + return (NULL); + } + + dfile = NULL; + anno = NULL; + data = NULL; + record = NULL; if (ispart) - dfile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"),"r"); + name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else - dfile = fopen(Catenate(db->path,".",track,".data"),"r"); + name = Catenate(db->path,".",track,".data"); + if (name == NULL) + goto error; + dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } if (fread(&size,sizeof(int),1,afile) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (size <= 0) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } if (ispart) { ureads = ((int *) (db->reads))[-1]; @@ -983,16 +1083,16 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) if (db->trimmed) { if (tracklen != treads) - { fprintf(stderr,"%s: Track %s not same size as database !\n",Prog_Name,track); - exit (1); + { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->tfirst,SEEK_CUR); } else { if (tracklen != ureads) - { fprintf(stderr,"%s: Track %s not same size as database !\n",Prog_Name,track); - exit (1); + { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); @@ -1000,25 +1100,19 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) nreads = db->nreads; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); - - if (size > 0) - { if (dfile == NULL) - { if (fread(anno,size,nreads,afile) != (size_t) nreads) - SYSTEM_ERROR - } - else - { if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) - SYSTEM_ERROR - } - } - else - SYSTEM_ERROR + if (anno == NULL) + goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; + if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; @@ -1041,19 +1135,33 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } + if (data == NULL) + goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); + goto error; + } } fclose(dfile); + dfile = NULL; } else - data = NULL; + { if (fread(anno,size,nreads,afile) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + data = NULL; + } fclose(afile); record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record"); + if (record == NULL) + goto error; record->name = Strdup(track,"Allocating Track Name"); + if (record->name == NULL) + goto error; record->data = data; record->anno = anno; record->size = size; @@ -1068,6 +1176,18 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) } return (record); + +error: + if (record == NULL) + free(record); + if (data != NULL) + free(data); + if (anno != NULL) + free(anno); + if (dfile != NULL) + fclose(dfile); + fclose(afile); + EXIT (NULL); } void Close_Track(HITS_DB *db, char *track) @@ -1106,7 +1226,7 @@ char *New_Read_Buffer(HITS_DB *db) read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) - exit (1); + EXIT(NULL); return (read+1); } @@ -1116,20 +1236,21 @@ char *New_Read_Buffer(HITS_DB *db) // // **NB**, the byte before read will be set to a delimiter character! -void Load_Read(HITS_DB *db, int i, char *read, int ascii) +int Load_Read(HITS_DB *db, int i, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; HITS_READ *r = db->reads; + if (i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(1); + } if (bases == NULL) - { db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r")); + { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) - exit (1); - } - if (i >= db->nreads) - { fprintf(stderr,"%s: Index out of bounds (Load_Read)\n",Prog_Name); - exit (1); + EXIT(1); + db->bases = (void *) bases; } off = r[i].boff; @@ -1140,7 +1261,9 @@ void Load_Read(HITS_DB *db, int i, char *read, int ascii) clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,bases) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(1); + } } Uncompress_Read(len,read); if (ascii == 1) @@ -1153,6 +1276,7 @@ void Load_Read(HITS_DB *db, int i, char *read, int ascii) } else read[-1] = 4; + return (0); } char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) @@ -1162,14 +1286,15 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) int bbeg, bend; HITS_READ *r = db->reads; + if (i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(NULL); + } if (bases == NULL) - { db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r")); + { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) - exit (1); - } - if (i >= db->nreads) - { fprintf(stderr,"%s: Index out of bounds (Load_Read)\n",Prog_Name); - exit (1); + EXIT(NULL); + db->bases = (void *) bases; } bbeg = beg/4; @@ -1183,7 +1308,9 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) clen = bend-bbeg; if (clen > 0) { if (fread(read,clen,1,bases) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(NULL); + } } Uncompress_Read(4*clen,read); read += beg%4; @@ -1219,7 +1346,7 @@ char **New_QV_Buffer(HITS_DB *db) qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) - exit (1); + EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); @@ -1228,22 +1355,22 @@ char **New_QV_Buffer(HITS_DB *db) // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. -void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) +int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) - { fprintf(stderr,"%s: QV's are not loaded!\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); + EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) - { fprintf(stderr,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); + EXIT(1); } reads = db->reads; @@ -1251,7 +1378,8 @@ void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); - Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen); + if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) + EXIT(1); if (ascii != 1) { char *deltag = entry[1]; @@ -1270,6 +1398,8 @@ void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) deltag[j] = (char) (deltag[j]+u); } } + + return (0); } @@ -1285,8 +1415,8 @@ void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). -void Read_All_Sequences(HITS_DB *db, int ascii) -{ FILE *bases = (FILE *) db->bases; +int Read_All_Sequences(HITS_DB *db, int ascii) +{ FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); @@ -1295,12 +1425,15 @@ void Read_All_Sequences(HITS_DB *db, int ascii) int64 o, off; int i, len, clen; + bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) - db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r")); - else - rewind(bases); + EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); + if (seq == NULL) + { fclose(bases); + EXIT(1); + } *seq++ = 4; @@ -1318,7 +1451,11 @@ void Read_All_Sequences(HITS_DB *db, int ascii) clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); + free(seq); + fclose(bases); + EXIT(1); + } } Uncompress_Read(len,seq+o); if (ascii) @@ -1332,9 +1469,11 @@ void Read_All_Sequences(HITS_DB *db, int ascii) db->bases = (void *) seq; db->loaded = 1; + + return (0); } -int List_DB_Files(char *path, void foreach(char *path, char *extension)) +int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; @@ -1351,13 +1490,15 @@ int List_DB_Files(char *path, void foreach(char *path, char *extension)) rlen = strlen(root); if (root == NULL || pwd == NULL) - { status = 1; - goto exit; + { free(pwd); + free(root); + EXIT(1); } if ((dirp = opendir(pwd)) == NULL) - { status = 1; - goto exit; + { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); + status = -1; + goto error; } isdam = 0; @@ -1380,15 +1521,16 @@ int List_DB_Files(char *path, void foreach(char *path, char *extension)) } } if (dp == NULL) - { status = 1; + { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); + status = -1; closedir(dirp); - goto exit; + goto error; } if (isdam) - foreach(Catenate(pwd,"/",root,".dam"),"dam"); + actor(Catenate(pwd,"/",root,".dam"),"dam"); else - foreach(Catenate(pwd,"/",root,".db"),"db"); + actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) @@ -1406,11 +1548,11 @@ int List_DB_Files(char *path, void foreach(char *path, char *extension)) continue; if (strncmp(name,root,rlen) != 0) continue; - foreach(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); + actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); -exit: +error: free(pwd); free(root); return (status); diff --git a/DB.h b/DB.h index 471c84a..567a91a 100644 --- a/DB.h +++ b/DB.h @@ -57,6 +57,36 @@ #include "QV.h" +#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" + // Undefine if you don't want this + +// For interactive applications where it is inappropriate to simply exit with an error +// message to standard error, define the constant INTERACTIVE. If set, then error +// messages are put in the global variable Ebuffer and the caller of a DB routine +// can decide how to deal with the error. +// +// DB, QV, or alignment routines that can encounter errors function as before in +// non-INTERACTIVE mode by exiting after printing an error message to stderr. In +// INTERACTIVE mode the routines place a message at EPLACE and return an error +// value. For such routines that were previously void, they are now int, and +// return 1 if an error occured, 0 otherwise. + +#undef INTERACTIVE + +#ifdef INTERACTIVE + +#define EPRINTF sprintf +#define EPLACE Ebuffer +#define EXIT(x) return (x) + +#else // BATCH + +#define EPRINTF fprintf +#define EPLACE stderr +#define EXIT(x) exit (1) + +#endif + typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; @@ -68,9 +98,6 @@ typedef signed long long int64; typedef float float32; typedef double float64; -#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" - // Undefine if you don't want this - /******************************************************************************************* * @@ -80,8 +107,14 @@ typedef double float64; extern char *Prog_Name; // Name of program +#ifdef INTERACTIVE + +extern char Ebuffer[]; + +#endif + #define SYSTEM_ERROR \ - { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } @@ -291,7 +324,7 @@ typedef struct // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: - // -1: The DB could not be opened for a reason reported by the routine to stderr + // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap @@ -309,10 +342,12 @@ void Trim_DB(HITS_DB *db); void Close_DB(HITS_DB *db); - // If QV pseudo track is not already in db's track list, then set it up and return a pointer - // to it. The database must not have been trimmed yet. + // If QV pseudo track is not already in db's track list, then load it and set it up. + // The database must not have been trimmed yet. -1 is returned if a .qvs file is not + // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE + // is defined. Otherwise a 0 is returned. -void Load_QVs(HITS_DB *db); +int Load_QVs(HITS_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. @@ -329,7 +364,9 @@ int Check_Track(HITS_DB *db, char *track); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be - // opened for some reason, then NULL is returned. + // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise + // the routine prints an error message to stderr and exits if an error occurs, and returns + // with NULL only if the track does not exist. HITS_TRACK *Load_Track(HITS_DB *db, char *track); @@ -340,27 +377,31 @@ void Close_Track(HITS_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte - // are needed by the alignment algorithms. + // are needed by the alignment algorithms. If cannot allocate memory then return NULL + // if INTERACTIVE is defined, or print error to stderr and exit otherwise. char *New_Read_Buffer(HITS_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimeter - // for traversals in either direction. + // for traversals in either direction. A non-zero value is returned if an error occured + // and INTERACTIVE is defined. -void Load_Read(HITS_DB *db, int i, char *read, int ascii); +int Load_Read(HITS_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to // the string holding the substring so it has a delimeter for traversals in either direction. + // A NULL pointer is returned if an error occured and INTERACTIVE is defined. char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur - // in the database. + // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, + // or print error to stderr and exit otherwise. #define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer #define DEL_TAG 1 // The deleted characters @@ -371,27 +412,31 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); char **New_QV_Buffer(HITS_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters - // are converted to a numeric or upper/lower case ascii string as per ascii. + // are converted to a numeric or upper/lower case ascii string as per ascii. Return with + // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. -void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); +int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T). + // Return with a zero, except when an error occurs and INTERACTIVE is defined in which + // case return wtih 1. -void Read_All_Sequences(HITS_DB *db, int ascii); +int Read_All_Sequences(HITS_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all - // those of the form "prefix/[.]root.part" and call foreach with the complete path to each file + // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.db" and extension "db". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. - // Return non-zero iff path could not be opened for any reason. + // -1 is returned if the path could not be found, and 1 is returned if an error (reported + // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. -int List_DB_Files(char *path, void foreach(char *path, char *extension)); +int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB diff --git a/QV.c b/QV.c index ffbbf5c..38f6db4 100644 --- a/QV.c +++ b/QV.c @@ -192,7 +192,7 @@ static HScheme *Huffman(uint64 *hist, HScheme *inscheme) scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) - exit (1); + return (NULL); hsize = 0; // Load heap value = 0; @@ -365,22 +365,30 @@ static HScheme *Read_Scheme(FILE *in) scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) - exit (1); + return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); + free(scheme); + return (NULL); + } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); + return (NULL); + } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); + free(scheme); + return (NULL); + } } else bits[i] = 0; @@ -536,7 +544,7 @@ static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int // Read and decode from in, the next rlen symbols into read according to scheme -static void Decode(HScheme *scheme, FILE *in, char *read, int rlen) +static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; @@ -563,33 +571,37 @@ static void Decode(HScheme *scheme, FILE *in, char *read, int rlen) lens = scheme->codelens; look = scheme->lookup; -#define GET \ - if (n > ilen) \ - { icode <<= ilen; \ - if (fread(ipart,sizeof(uint32),1,in) != 1) \ - SYSTEM_ERROR \ - ilen = n-ilen; \ - icode <<= ilen; \ - ilen = 32-ilen; \ - } \ - else \ - { icode <<= n; \ - ilen -= n; \ +#define GET \ + if (n > ilen) \ + { icode <<= ilen; \ + if (fread(ipart,sizeof(uint32),1,in) != 1) \ + { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ + return (1); \ + } \ + ilen = n-ilen; \ + icode <<= ilen; \ + ilen = 32-ilen; \ + } \ + else \ + { icode <<= n; \ + ilen -= n; \ } -#define GETFLIP \ - if (n > ilen) \ - { icode <<= ilen; \ - if (fread(ipart,sizeof(uint32),1,in) != 1) \ - SYSTEM_ERROR \ - Flip_Long(ipart); \ - ilen = n-ilen; \ - icode <<= ilen; \ - ilen = 32-ilen; \ - } \ - else \ - { icode <<= n; \ - ilen -= n; \ +#define GETFLIP \ + if (n > ilen) \ + { icode <<= ilen; \ + if (fread(ipart,sizeof(uint32),1,in) != 1) \ + { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ + return (1); \ + } \ + Flip_Long(ipart); \ + ilen = n-ilen; \ + icode <<= ilen; \ + ilen = 32-ilen; \ + } \ + else \ + { icode <<= n; \ + ilen -= n; \ } n = 16; @@ -619,13 +631,15 @@ static void Decode(HScheme *scheme, FILE *in, char *read, int rlen) } read[j] = (char) c; } + + return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme -static void Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, - int rlen, int rchar) +static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, + int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; @@ -709,6 +723,8 @@ static void Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, read[j] = (char) c; } } + + return (0); } @@ -752,7 +768,7 @@ static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, -static int Rmax = -1; // Compress_Next_QVentry, Uncompress_Next_QVentry +static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan @@ -761,17 +777,21 @@ char *QVentry() // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned -// unless eof occurs in which case return -1. +// unless eof occurs in which case return -1. If any error occurs return -2. int Read_Lines(FILE *input, int nlines) { int i, rlen; + int tmax; + char *tread; char *other; if (Read == NULL) - { Rmax = MIN_BUFFER; - Read = (char *) Malloc(5*Rmax,"Allocating QV entry read buffer"); - if (Read == NULL) - exit (1); + { tmax = MIN_BUFFER; + tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); + if (tread == NULL) + EXIT(-2); + Rmax = tmax; + Read = tread; } Nline += 1; @@ -780,13 +800,15 @@ int Read_Lines(FILE *input, int nlines) rlen = strlen(Read); while (Read[rlen-1] != '\n') - { Rmax = ((int) 1.4*Rmax) + MIN_BUFFER; - Read = (char *) Realloc(Read,5*Rmax,"Reallocating QV entry read buffer"); - if (Read == NULL) - exit (1); + { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; + tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); + if (tread == NULL) + EXIT(-2); + Rmax = tmax; + Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) - { fprintf(stderr,"Line %d: Last line does not end with a newline !\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); + EXIT(-2); } rlen += strlen(Read+rlen); } @@ -795,12 +817,12 @@ int Read_Lines(FILE *input, int nlines) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) - { fprintf(stderr,"Line %d: incomplete last entry of .quiv file\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT(-2); } if (rlen != (int) strlen(other)) - { fprintf(stderr,"Line %d: Lines for an entry are not the same length\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); + EXIT(-2); } } return (rlen-1); @@ -871,7 +893,7 @@ static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding -void QVcoding_Scan(FILE *input) +int QVcoding_Scan(FILE *input) { char *slash; int rlen; @@ -900,29 +922,32 @@ void QVcoding_Scan(FILE *input) { int well, beg, end, qv; rlen = Read_Lines(input,1); + if (rlen == -2) + EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') - { fprintf(stderr,"Line %d: Header in quiv file is missing\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); + EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) - { fprintf(stderr,"%s: Line %d: Header line incorrectly formatted ?\n", + { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); - exit (1); + EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) - { fprintf(stderr,"%s: Line %d: Header line incorrectly formatted ?\n", + { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); - exit (1); + EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) - { fprintf(stderr,"Line %d: incomplete last entry of .quiv file\n",Nline); - exit (1); + { if (rlen == -1) + EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); @@ -956,6 +981,8 @@ void QVcoding_Scan(FILE *input) if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } + + return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write @@ -964,8 +991,16 @@ void QVcoding_Scan(FILE *input) QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; - static HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; - static HScheme *dRunScheme, *sRunScheme; + + HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; + HScheme *dRunScheme, *sRunScheme; + + delScheme = NULL; + dRunScheme = NULL; + insScheme = NULL; + mrgScheme = NULL; + subScheme = NULL; + sRunScheme = NULL; // Check whether using a subtitution run char is a win @@ -996,6 +1031,8 @@ QVcoding *Create_QVcoding(int lossy) #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ + if (scheme == NULL) \ + goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ @@ -1077,6 +1114,21 @@ QVcoding *Create_QVcoding(int lossy) coding.flip = 0; return (&coding); + +error: + if (delScheme != NULL) + free(delScheme); + if (dRunScheme != NULL) + free(dRunScheme); + if (insScheme != NULL) + free(insScheme); + if (mrgScheme != NULL) + free(mrgScheme); + if (subScheme != NULL) + free(subScheme); + if (sRunScheme != NULL) + free(sRunScheme); + EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' @@ -1131,11 +1183,15 @@ QVcoding *Read_QVcoding(FILE *input) int len; if (fread(&half,sizeof(uint16),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); + EXIT(NULL); + } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); + EXIT(NULL); + } if (coding.flip) Flip_Short(&half); coding.delChar = half; @@ -1143,7 +1199,9 @@ QVcoding *Read_QVcoding(FILE *input) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); + EXIT(NULL); + } if (coding.flip) Flip_Short(&half); coding.subChar = half; @@ -1153,15 +1211,19 @@ QVcoding *Read_QVcoding(FILE *input) // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); + EXIT(NULL); + } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) - exit (1); + EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); + EXIT(NULL); + } } coding.prefix[len] = '\0'; } @@ -1172,16 +1234,52 @@ QVcoding *Read_QVcoding(FILE *input) // Read the Huffman schemes used to compress the data - coding.delScheme = Read_Scheme(input); + coding.delScheme = NULL; + coding.dRunScheme = NULL; + coding.insScheme = NULL; + coding.mrgScheme = NULL; + coding.subScheme = NULL; + coding.sRunScheme = NULL; + + coding.delScheme = Read_Scheme(input); + if (coding.delScheme == NULL) + goto error; if (coding.delChar >= 0) - coding.dRunScheme = Read_Scheme(input); - coding.insScheme = Read_Scheme(input); - coding.mrgScheme = Read_Scheme(input); - coding.subScheme = Read_Scheme(input); + { coding.dRunScheme = Read_Scheme(input); + if (coding.dRunScheme == NULL) + goto error; + } + coding.insScheme = Read_Scheme(input); + if (coding.insScheme == NULL) + goto error; + coding.mrgScheme = Read_Scheme(input); + if (coding.mrgScheme == NULL) + goto error; + coding.subScheme = Read_Scheme(input); + if (coding.subScheme == NULL) + goto error; if (coding.subChar >= 0) - coding.sRunScheme = Read_Scheme(input); + { coding.sRunScheme = Read_Scheme(input); + if (coding.sRunScheme == NULL) + goto error; + } return (&coding); + +error: + if (coding.delScheme != NULL) + free(coding.delScheme); + if (coding.dRunScheme != NULL) + free(coding.dRunScheme); + if (coding.insScheme != NULL) + free(coding.insScheme); + if (coding.mrgScheme != NULL) + free(coding.mrgScheme); + if (coding.subScheme != NULL) + free(coding.subScheme); + if (coding.sRunScheme != NULL) + free(coding.sRunScheme); + EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument @@ -1205,15 +1303,16 @@ void Free_QVcoding(QVcoding *coding) * ********************************************************************************************/ -void Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) +int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) - { fprintf(stderr,"Line %d: incomplete last entry of .quiv file\n",Nline); - exit (1); + { if (rlen == -1) + EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT (1); } if (coding->delChar < 0) @@ -1247,45 +1346,61 @@ void Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int loss else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); + + return (0); } -void Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) +int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) - { Decode(coding->delScheme, input, entry[0], rlen); + { if (Decode(coding->delScheme, input, entry[0], rlen)) + EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); + EXIT(1); + } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else - { Decode_Run(coding->delScheme, coding->dRunScheme, input, - entry[0], rlen, coding->delChar); + { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, + entry[0], rlen, coding->delChar)) + EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); + EXIT(1); + } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } - Decode(coding->insScheme, input, entry[2], rlen); + if (Decode(coding->insScheme, input, entry[2], rlen)) + EXIT(1); - Decode(coding->mrgScheme, input, entry[3], rlen); + if (Decode(coding->mrgScheme, input, entry[3], rlen)) + EXIT(1); if (coding->subChar < 0) - Decode(coding->subScheme, input, entry[4], rlen); + { if (Decode(coding->subScheme, input, entry[4], rlen)) + EXIT(1); + } else - Decode_Run(coding->subScheme, coding->sRunScheme, input, - entry[4], rlen, coding->subChar); + { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, + entry[4], rlen, coding->subChar)) + EXIT(1); + } + + return (0); } diff --git a/QV.h b/QV.h index 08c1801..35fbadc 100644 --- a/QV.h +++ b/QV.h @@ -52,6 +52,15 @@ #define _QV_COMPRESSOR + // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or + // batch version of the routines in this library are compiled. In batch mode, routines + // print an error message and exit. In interactive mode, the routines place the error + // message in EPLACE (also defined in DB.h) and return an error value, typically NULL + // if the routine returns a pointer, and an unusual integer value if the routine returns + // an integer. + // Below when an error return is described, one should understand that this value is returned + // only if the routine was compiled in INTERACTIVE mode. + // A PacBio compression scheme typedef struct @@ -68,23 +77,27 @@ typedef struct } QVcoding; // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. + // If end-of-input is encountered before any further input, -1 is returned. If there is + // an error than -2 is returned. Otherwise the length of the line(s) read is returned. int Read_Lines(FILE *input, int nlines); char *QVentry(); - // Read the .quiva file on input and record frequency statistics. + // Read the .quiva file on input and record frequency statistics. If there is an error + // then 1 is returned, otherwise 0. -void QVcoding_Scan(FILE *input); +int QVcoding_Scan(FILE *input); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object - // is *statically* allocated within the routine. If lossy is set then use a lossy scaling - // for the insertion and merge streams. + // is *statically allocated within the routine. If lossy is set then use a lossy scaling + // for the insertion and merge streams. If there is an error, then NULL is returned. QVcoding *Create_QVcoding(int lossy); // Read/write a coding scheme to input/output. The encoding object returned by the reader - // is *statically* allocated within the routine. + // is *statically* allocated within the routine. If an error occurs while reading then + // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); @@ -95,16 +108,18 @@ void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy - // is set then the scheme is a lossy one. + // is set then the scheme is a lossy one. A non-zero value is return only if an + // error occured. -void Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); +int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); - // Assuming the input is positioned just beyond the compressed encoding of an entry header, + // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceeding header line, critically - // provides the length of each of the 5 vectors. + // provides the length of each of the 5 vectors. A non-zero value is return only if an + // error occured. -void Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); +int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR diff --git a/align.c b/align.c index 839609e..4666fe0 100644 --- a/align.c +++ b/align.c @@ -73,7 +73,6 @@ #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_SCRIPT // Show trace additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace -#define SMALL_BIT 100 #undef SHOW_TRACE // Show full trace for Print_Alignment @@ -102,7 +101,7 @@ Work_Data *New_Work_Data() work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) - exit (1); + EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; @@ -114,25 +113,43 @@ Work_Data *New_Work_Data() return ((Work_Data *) work); } -static void enlarge_vector(_Work_Data *work, int newmax) -{ work->vecmax = ((int) (newmax*1.2)) + 10000; - work->vector = Realloc(work->vector,work->vecmax,"Enlarging DP vector"); - if (work->vector == NULL) - exit (1); +static int enlarge_vector(_Work_Data *work, int newmax) +{ void *vec; + int max; + + max = ((int) (newmax*1.2)) + 10000; + vec = Realloc(work->vector,max,"Enlarging DP vector"); + if (vec == NULL) + EXIT(1); + work->vecmax = max; + work->vector = vec; + return (0); } -static void enlarge_points(_Work_Data *work, int newmax) -{ work->pntmax = ((int) (newmax*1.2)) + 10000; - work->points = Realloc(work->points,work->pntmax,"Enlarging point vector"); - if (work->points == NULL) - exit (1); +static int enlarge_points(_Work_Data *work, int newmax) +{ void *vec; + int max; + + max = ((int) (newmax*1.2)) + 10000; + vec = Realloc(work->points,max,"Enlarging point vector"); + if (vec == NULL) + EXIT(1); + work->pntmax = max; + work->points = vec; + return (0); } -static void enlarge_trace(_Work_Data *work, int newmax) -{ work->tramax = ((int) (newmax*1.2)) + 10000; - work->trace = Realloc(work->trace,work->tramax,"Enlarging trace vector"); - if (work->trace == NULL) - exit (1); +static int enlarge_trace(_Work_Data *work, int newmax) +{ void *vec; + int max; + + max = ((int) (newmax*1.2)) + 10000; + vec = Realloc(work->trace,max,"Enlarging trace vector"); + if (vec == NULL) + EXIT(1); + work->tramax = max; + work->trace = vec; + return (0); } void Free_Work_Data(Work_Data *ework) @@ -176,16 +193,6 @@ void Free_Work_Data(Work_Data *ework) static double Bias_Factor[10] = { .690, .690, .690, .690, .780, .850, .900, .933, .966, 1.000 }; - // Micro-Sat Band Parameters - -#define MICRO_SAT 20 - -static int Sat_Width[MICRO_SAT+1] = - { -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; - -#define SAT_LOW .75 -#define SAT_HGH 1.25 - // Adjustable paramters typedef struct @@ -234,7 +241,7 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) - exit (1); + EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; @@ -248,8 +255,9 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) - { fprintf(stderr,"Warning: Base bias worse than 80/20%% !\n"); - bias = 3; + { EPRINTF(EPLACE,"Base bias worse than 80/20%% ! (New_Align_Spec)\n"); + free(spec); + EXIT(NULL); } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); @@ -258,7 +266,9 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) - exit (1); + { free(spec); + EXIT(NULL); + } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); @@ -346,20 +356,25 @@ typedef struct int mark; } Pebble; -static int forward_wave(_Work_Data *work, _Align_Spec *spec, - Alignment *align, Path *bpath, - int mind, int maxd, int mida) +static int VectorEl = 6*sizeof(int) + sizeof(BVEC); + +static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, + int *mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; - int minp, maxp; + int vlen, vmin, vmax; int *V, *M; + int *_V, *_M; BVEC *T; + BVEC *_T; int *HA, *HB; + int *_HA, *_HB; int *NA, *NB; + int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; @@ -376,17 +391,33 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, int more, morem, lasta; int aclip, bclip; - { int alen = align->alen + 1; - int blen = align->blen + 1; - int tlen = alen + blen + 1; + hgh = maxd; + low = *mind; + dif = 0; - V = ((int *) work->vector) + blen; - M = V + tlen; - HA = M + tlen; - HB = HA + tlen; - NA = HB + tlen; - NB = NA + tlen; - T = ((BVEC *) (NB + alen)) + blen; + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEl; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; @@ -400,39 +431,6 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, /* Compute 0-wave starting from mid-line */ - hgh = maxd; - low = mind; - if (aseq == bseq) - { if (low < 0) - { int big = -low; - int sml = -hgh; - - if (big <= MICRO_SAT) - minp = low - Sat_Width[big]; - else - minp = -SAT_HGH*big; - if (sml <= MICRO_SAT) - maxp = hgh + Sat_Width[sml]; - else - maxp = -SAT_LOW*sml; - } - else - { if (low <= MICRO_SAT) - minp = low - Sat_Width[low]; - else - minp = SAT_LOW*low; - if (hgh <= MICRO_SAT) - maxp = hgh + Sat_Width[hgh]; - else - maxp = SAT_HGH*hgh; - } - } - else - { minp = -INT32_MAX; - maxp = INT32_MAX; - } - dif = 0; - more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; @@ -460,7 +458,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; @@ -512,7 +512,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); @@ -530,7 +532,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); @@ -604,6 +608,82 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, int am, ac, ap; char *a; + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move; + int64 vd, md, had, hbd, nad, nbd, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEl)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEl; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (hbd < 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (nbd < 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nbd > 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (hbd > 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + } + if (low > minp) { low -= 1; NA[low] = NA[low+1]; @@ -702,7 +782,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); @@ -724,7 +806,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); @@ -938,27 +1022,29 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, bpath->tlen = btlen; } - work->cells = (void *) cells; - work->celmax = cmax; - - return (low); + *mind = low; + return (0); } /*** Reverse Wave ***/ -static void reverse_wave(_Work_Data *work, _Align_Spec *spec, - Alignment *align, Path *bpath, int mind, int maxd, int mida) +static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, + int mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; - int minp, maxp; + int vlen, vmin, vmax; int *V, *M; + int *_V, *_M; BVEC *T; + BVEC *_T; int *HA, *HB; + int *_HA, *_HB; int *NA, *NB; + int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; @@ -975,17 +1061,33 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, int more, morem, lasta; int aclip, bclip; - { int alen = align->alen + 1; - int blen = align->blen + 1; - int tlen = alen + blen + 1; + hgh = maxd; + low = mind; + dif = 0; - V = ((int *) work->vector) + blen; - M = V + tlen; - HA = M + tlen; - HB = HA + tlen; - NA = HB + tlen; - NB = NA + tlen; - T = ((BVEC *) (NB + alen)) + blen; + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEl; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; @@ -997,39 +1099,6 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, boff = 0; } - hgh = maxd; - low = mind; - if (aseq == bseq) - { if (low < 0) - { int big = -low; - int sml = -hgh; - - if (big <= MICRO_SAT) - minp = low - Sat_Width[big]; - else - minp = -SAT_HGH*big; - if (sml <= MICRO_SAT) - maxp = hgh + Sat_Width[sml]; - else - maxp = -SAT_LOW*sml; - } - else - { if (low <= MICRO_SAT) - minp = low - Sat_Width[low]; - else - minp = SAT_LOW*low; - if (hgh <= MICRO_SAT) - maxp = hgh + Sat_Width[hgh]; - else - maxp = SAT_HGH*hgh; - } - } - else - { minp = -INT32_MAX; - maxp = INT32_MAX; - } - dif = 0; - more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; @@ -1057,7 +1126,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; @@ -1107,7 +1178,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); @@ -1125,7 +1198,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); @@ -1197,6 +1272,81 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, int am, ac, ap; char *a; + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move, vd, md, had, hbd, nad, nbd, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEl)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEl; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (hbd < 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (nbd < 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nbd > 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (hbd > 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + } + if (low > minp) { low -= 1; NA[low] = NA[low+1]; @@ -1295,7 +1445,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); @@ -1316,7 +1468,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); @@ -1586,8 +1740,7 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, bpath->trace = btrace + btlen; } - work->cells = (void *) cells; - work->celmax = cmax; + return (0); } @@ -1596,21 +1749,27 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, */ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, - int xlow, int xhgh, int ycnt) + int low, int hgh, int anti, int lbord, int hbord) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath, *bpath; - int alen, blen; + int minp, maxp; + int selfie; - alen = align->alen; - blen = align->blen; + { int alen, blen; + int maxtp, wsize; - { int maxtp, wsize; + alen = align->alen; + blen = align->blen; - wsize = (6*sizeof(int) + sizeof(BVEC))*(alen+blen+3); + if (hgh-low >= 7500) + wsize = VectorEl*(hgh-low+1); + else + wsize = VectorEl*10000; if (wsize >= work->vecmax) - enlarge_vector(work,wsize); + if (enlarge_vector(work,wsize)) + EXIT(NULL); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); @@ -1618,7 +1777,8 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, maxtp = 2*(alen/spec->trace_space+2); wsize = 4*maxtp*sizeof(uint16) + sizeof(Path); if (wsize > work->pntmax) - enlarge_points(work,wsize); + if (enlarge_points(work,wsize)) + EXIT(NULL); apath = align->path; bpath = (Path *) work->points; @@ -1631,30 +1791,41 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, printf("\n"); #endif - { int l, h, a; + selfie = (align->aseq == align->bseq); + + if (lbord < 0) + { if (selfie && low >= 0) + minp = 1; + else + minp = -INT32_MAX; + } + else + minp = low-lbord; + if (hbord < 0) + { if (selfie && hgh <= 0) + maxp = -1; + else + maxp = INT32_MAX; + } + else + maxp = hgh+hbord; - l = xlow-ycnt; - h = xhgh-ycnt; - a = (xlow+xhgh)/2+ycnt; - if (a < h) - a = h; - if (a < -l) - a = -l; - if (a > 2*alen - h) - a = 2*alen-h; - if (a > 2*blen + l) - a = 2*blen+l; + if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp)) + EXIT(NULL); - l = forward_wave(work,spec,align,bpath,l,h,a); #ifdef DEBUG_PASSES - printf("F1 (%d-%d,%d) => (%d,%d) %d\n",xlow,xhgh,ycnt,apath->aepos,apath->bepos,apath->diffs); + printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n", + (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low, + apath->aepos,apath->bepos,apath->diffs); #endif - reverse_wave(work,spec,align,bpath,l,l,a); + if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp)) + EXIT(NULL); + #ifdef DEBUG_PASSES - printf("R1 (%d,%d) => (%d,%d) %d\n",l,ycnt,apath->abpos,apath->bbpos,apath->diffs); + printf("R1 (%d,%d) => (%d,%d) %d\n", + (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs); #endif - } if (COMP(align->flags)) { uint16 *trace = (uint16 *) bpath->trace; @@ -1802,7 +1973,7 @@ int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) - fprintf(stderr," %s: Wrong number of trace points\n",fname); + EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; @@ -1818,7 +1989,7 @@ int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) } if (p != ovl->path.bepos) { if (verbose) - fprintf(stderr," %s: Trace point sum != aligned interval\n",fname); + EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } return (0); @@ -1921,8 +2092,8 @@ void Complement_Seq(char *aseq, int len) static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; -void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, - int indent, int width, int border, int upper, int coord) +int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, + int indent, int width, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; @@ -1937,7 +2108,7 @@ void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, int match, diff; char *N2A; - if (trace == NULL) return; + if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); @@ -1947,7 +2118,8 @@ void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, o = sizeof(char)*3*(width+1); if (o > work->vecmax) - enlarge_vector(work,o); + if (enlarge_vector(work,o)) + EXIT(1); if (upper) N2A = ToU; @@ -2157,10 +2329,11 @@ void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, fprintf(file,"\n"); fflush(file); + return (0); } -void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, - int indent, int block, int border, int upper, int coord) +int Print_Reference(FILE *file, Alignment *align, Work_Data *ework, + int indent, int block, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; @@ -2176,7 +2349,7 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, char *N2A; int vmax; - if (trace == NULL) return; + if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); @@ -2187,7 +2360,8 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, vmax = work->vecmax/3; o = sizeof(char)*6*(block+1); if (o > vmax) - { enlarge_vector(work,3*o); + { if (enlarge_vector(work,3*o)) + EXIT(1); vmax = work->vecmax/3; } @@ -2245,7 +2419,8 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ - { enlarge_vector(work,3*o); \ + { if (enlarge_vector(work,3*o)) \ + EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ @@ -2408,6 +2583,7 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, fprintf(file,"\n"); fflush(file); + return (0); } /* Print an ASCII representation of the overlap in align between fragments @@ -2773,7 +2949,7 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave) } -static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) +static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; @@ -2792,13 +2968,15 @@ static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) L = asub; L *= sizeof(int); if (L > work->tramax) - enlarge_trace(work,L); + if (enlarge_trace(work,L)) + EXIT(1); trace = wave.Stop = ((int *) work->trace); D = 2*(path->diffs + 4)*sizeof(int); if (D > work->vecmax) - enlarge_vector(work,D); + if (enlarge_vector(work,D)) + EXIT(1); D = (path->diffs+3)/2; wave.VF = ((int *) work->vector) + (D+1); @@ -2811,6 +2989,7 @@ static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) align->bseq+path->bbpos,path->bepos-path->bbpos,&wave); path->trace = trace; path->tlen = wave.Stop - trace; + return (0); } @@ -2840,7 +3019,8 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) { int *F0, *F1, *F2; int *HF; - int low, hgh, pos; + int low, hgh; + int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); @@ -2862,10 +3042,19 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) { low = del; hgh = 0; } + + posl = -INT32_MAX; + posh = INT32_MAX; if (wave->Aabs == wave->Babs) - pos = B-A; - else - pos = -INT32_MAX; + { if (B == A) + { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); + EXIT(-1); + } + else if (B < A) + posl = (B-A)+1; + else + posh = (B-A)-1; + } F1 = PVF[-2]; F0 = PVF[-1]; @@ -2888,10 +3077,10 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) HF = PHF[D]; if ((D & 0x1) == 0) - { hgh += 1; - low -= 1; - if (low <= pos) - low += 1; + { if (low > posl) + low -= 1; + if (hgh < posh) + hgh += 1; } F0[hgh+1] = F0[low-1] = -2; @@ -3086,7 +3275,8 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) { int *F0, *F1, *F2; int *HF; - int low, hgh, pos; + int low, hgh; + int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); @@ -3108,10 +3298,19 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) { low = del; hgh = 0; } + + posl = -INT32_MAX; + posh = INT32_MAX; if (wave->Aabs == wave->Babs) - pos = B-A; - else - pos = -INT32_MAX; + { if (B == A) + { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); + EXIT(1); + } + else if (B < A) + posl = (B-A)+1; + else + posh = (B-A)-1; + } F1 = PVF[-2]; F0 = PVF[-1]; @@ -3134,10 +3333,10 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) HF = PHF[D]; if ((D & 0x1) == 0) - { hgh += 1; - low -= 1; - if (low <= pos) - low += 1; + { if (low > posl) + low -= 1; + if (hgh < posh) + hgh += 1; } F0[hgh+1] = F0[low-1] = -2; @@ -3246,7 +3445,7 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } - return (1); + return (0); } @@ -3256,13 +3455,13 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) * * \****************************************************************************************/ -void Compute_Trace_ALL(Alignment *align, Work_Data *ework) +int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; - int M, N; + int M, N, D; path = align->path; aseq = align->aseq; @@ -3282,19 +3481,19 @@ void Compute_Trace_ALL(Alignment *align, Work_Data *ework) s = M; s *= sizeof(int); if (s > work->tramax) - enlarge_trace(work,s); + if (enlarge_trace(work,s)) + EXIT(1); dmax = path->diffs - abs(M-N); s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *)); if (s > 256000000) - { Compute_Trace_ND_ALL(align,ework); - return; - } + return (Compute_Trace_ND_ALL(align,ework)); if (s > work->vecmax) - enlarge_vector(work,s); + if (enlarge_vector(work,s)) + EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); @@ -3312,12 +3511,17 @@ void Compute_Trace_ALL(Alignment *align, Work_Data *ework) wave.Aabs = aseq; wave.Babs = bseq; - path->diffs = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave); + D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave); + if (D < 0) + EXIT(1); + path->diffs = D; path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); + + return (0); } -void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) +int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; @@ -3348,7 +3552,8 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) else s = M*sizeof(int); if (s > work->tramax) - enlarge_trace(work,s); + if (enlarge_trace(work,s)) + EXIT(1); nmax = 0; dmax = 0; @@ -3366,7 +3571,8 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) - enlarge_vector(work,s); + if (enlarge_vector(work,s)) + EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); @@ -3384,7 +3590,7 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) wave.Aabs = aseq; wave.Babs = bseq; - { int i; + { int i, d; diffs = 0; ab = path->abpos; @@ -3394,21 +3600,29 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; - diffs += iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + if (d < 0) + EXIT(1); + diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; - diffs += iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + if (d < 0) + EXIT(1); + diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; + + return (0); } -void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) +int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; @@ -3439,7 +3653,8 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) else s = M*sizeof(int); if (s > work->tramax) - enlarge_trace(work,s); + if (enlarge_trace(work,s)) + EXIT(1); nmax = 0; dmax = 0; @@ -3457,7 +3672,8 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) - enlarge_vector(work,s); + if (enlarge_vector(work,s)) + EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); @@ -3475,7 +3691,7 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) wave.Aabs = aseq; wave.Babs = bseq; - { int i; + { int i, d; int as, bs; int af, bf; @@ -3488,28 +3704,42 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) { ae = ae + trace_spacing; be = bb + points[i]; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave)) - { af = wave.mida; - bf = wave.midb; - diffs += iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); - ab = ae; - bb = be; - as = af; - bs = bf; - } - } - ae = path->aepos; - be = path->bepos; - if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave)) - { af = wave.mida; + EXIT(1); + af = wave.mida; bf = wave.midb; - diffs += iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); + if (d < 0) + EXIT(1); + diffs += d; + ab = ae; + bb = be; as = af; bs = bf; } - diffs += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave); + + ae = path->aepos; + be = path->bepos; + + if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave)) + EXIT(1); + af = wave.mida; + bf = wave.midb; + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); + if (d < 0) + EXIT(1); + diffs += d; + as = af; + bs = bf; + + d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave); + if (d < 0) + EXIT(1); + diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; + + return (0); } diff --git a/align.h b/align.h index bc95c12..110cfcc 100644 --- a/align.h +++ b/align.h @@ -58,6 +58,20 @@ #define _A_MODULE +/*** INTERACTIVE vs BATCH version + + The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or + batch version of the routines in this library are compiled. In batch mode, routines + print an error message and exit. In interactive mode, the routines place the error + message in EPLACE (also defined in DB.h) and return an error value, typically NULL + if the routine returns a pointer, and an unusual integer value if the routine returns + an integer. + Below when an error return is described, one should understand that this value is returned + only if the routine was compiled in INTERACTIVE mode. + +***/ + + /*** PATH ABSTRACTION: Coordinates are *between* characters where 0 is the tick just before the first char, @@ -157,8 +171,8 @@ void Complement_Seq(char *a, int n); storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls - to the routines. Free_Work_Data frees a Work_Data object and all working storage - held by it. + to the routines. If enough memory for a Work_Data is not available then NULL is returned. + Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; @@ -182,7 +196,8 @@ void Complement_Seq(char *a, int n); If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically - measured function that increases from 1 as the entropy of freq decreases. + measured function that increases from 1 as the entropy of freq decreases. If memory is + unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. @@ -199,19 +214,27 @@ void Complement_Seq(char *a, int n); float *Base_Frequencies (Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in - 'align' subject to the alignment criterion given by the Align_Spec 'spec' that passes - through one of the points '(xlow-xhgh,y)' within the underlying dynamic programming matrix. - The path record of 'align' has its 'trace' filled from the point of view of an overlap between - the aread and the bread. In addition a Path record from the point of view of the bread - versus the aread is returned by the function, with this Path's 'trace' filled in - appropriately. The space for the returned path and the two 'trace's are in the - working storage supplied by the Work_Data packet and this space is reused with each call, - so if one wants to retain the bread-path and the two trace point sequences, then they - must be copied to user-allocated storage before calling the routine again. + 'align' subject to: + + (a) the alignment criterion given by the Align_Spec 'spec', + (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within + the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh + on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), + (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and + (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. + + The path record of 'align' has its 'trace' filled from the point of view of an overlap + between the aread and the bread. In addition a Path record from the point of view of the + bread versus the aread is returned by the function, with this Path's 'trace' filled in + appropriately. The space for the returned path and the two 'trace's are in the working + storage supplied by the Work_Data packet and this space is reused with each call, so if + one wants to retain the bread-path and the two trace point sequences, then they must be + copied to user-allocated storage before calling the routine again. NULL is returned in + the event of an error. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, - int xlow, int xhgh, int y); + int low, int hgh, int anti, int lbord, int hbord); /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment. If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points @@ -228,12 +251,13 @@ void Complement_Seq(char *a, int n); but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of pass through points. It is generally twice as - slow as Compute_Trace_PTS, but it produces nearer optimal alignments. + slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines + return 1 if an error occurred and 0 otherwise. */ - void Compute_Trace_ALL(Alignment *align, Work_Data *work); - void Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing); - void Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing); + int Compute_Trace_ALL(Alignment *align, Work_Data *work); + int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing); + int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing); /* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls @@ -253,6 +277,8 @@ void Complement_Seq(char *a, int n); in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. + Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. + Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be to a full alignment trace and this trace is also appropriately inverted. @@ -260,10 +286,10 @@ void Complement_Seq(char *a, int n); void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); - void Print_Alignment(FILE *file, Alignment *align, Work_Data *work, + int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); - void Print_Reference(FILE *file, Alignment *align, Work_Data *work, + int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); diff --git a/filter.c b/filter.c index 3205049..db80348 100644 --- a/filter.c +++ b/filter.c @@ -67,13 +67,15 @@ // <= 4 ^ -(kmer-MAX_BIAS) #define MAXGRAM 10000 // Cap on k-mer count histogram (in count_thread, merge_thread) +#define PANEL_SIZE 50000 // Size to break up very long A-reads +#define PANEL_OVERLAP 10000 // Overlap of A-panels + #undef TEST_LSORT #undef TEST_KSORT #undef TEST_PAIRS #undef TEST_CSORT #define HOW_MANY 3000 // Print first HOW_MANY items for each of the TEST options above -#define TEST_HIT // When off, just tuple filtering alone, no check #undef TEST_GATHER #undef SHOW_OVERLAP // Show the cartoon #undef SHOW_ALIGNMENT // Show the alignment @@ -103,7 +105,7 @@ typedef struct } KmerPos; typedef struct - { int bpos; + { int diag; int apos; int aread; int bread; @@ -119,7 +121,7 @@ typedef struct typedef struct { int apos; - int bpos; + int diag; int bread; int aread; } SeedPair; @@ -1069,7 +1071,7 @@ static void *merge_thread(void *arg) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = ar; hits[nhits].apos = ap; - hits[nhits].bpos = bsort[b].rpos; + hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } @@ -1086,7 +1088,7 @@ static void *merge_thread(void *arg) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = ar; hits[nhits].apos = ap; - hits[nhits].bpos = bsort[b].rpos; + hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } @@ -1122,7 +1124,7 @@ static void *merge_thread(void *arg) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = asort[a].read; hits[nhits].apos = ap; - hits[nhits].bpos = bsort[b].rpos; + hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } @@ -1137,6 +1139,37 @@ static void *merge_thread(void *arg) } +void Diagonal_Span(Path *path, int tspace, int *mind, int *maxd) +{ uint16 *points; + int i, tlen; + int dd, low, hgh; + + points = (uint16 *) path->trace; + tlen = path->tlen; + + dd = path->bbpos - path->abpos; + low = hgh = dd; + + dd = path->bepos - path->aepos; + if (dd < low) + low = dd; + else if (dd > hgh) + hgh = dd; + + dd = path->bbpos - (path->abpos/tspace)*tspace; + tlen -= 2; + for (i = 1; i < tlen; i += 2) + { dd += points[i]-tspace; + if (dd < low) + low = dd; + else if (dd > hgh) + hgh = dd; + } + + *mind = (low >> Binshift)-1; + *maxd = (hgh >> Binshift)+1; +} + static HITS_DB *MR_ablock; static HITS_DB *MR_bblock; static SeedPair *MR_hits; @@ -1147,6 +1180,7 @@ typedef struct { int64 beg, end; int *score; int *lastp; + int *lasta; Work_Data *work; Align_Spec *aspec; FILE *ofile1; @@ -1164,22 +1198,23 @@ static void *report_thread(void *arg) HITS_READ *aread = MR_ablock->reads; HITS_READ *bread = MR_bblock->reads; int *score = data->score; + int *scorp = data->score + 1; + int *scorm = data->score - 1; int *lastp = data->lastp; -#ifdef TEST_HIT + int *lasta = data->lasta; Work_Data *work = data->work; -#endif Align_Spec *aspec = data->aspec; FILE *ofile1 = data->ofile1; FILE *ofile2 = data->ofile2; int afirst = MR_ablock->tfirst; int bfirst = MR_bblock->tfirst; + int maxdiag = ( MR_ablock->maxlen >> Binshift); + int mindiag = (-MR_bblock->maxlen >> Binshift); Overlap _ovla, *ovla = &_ovla; Overlap _ovlb, *ovlb = &_ovlb; Alignment _align, *align = &_align; -#ifdef TEST_HIT Path *apath; -#endif Path *bpath = &(ovlb->path); int64 nfilt = 0; int64 ahits = 0; @@ -1188,9 +1223,8 @@ static void *report_thread(void *arg) Double *hitc; int minhit; - - uint64 p, q; - int64 g, h, f, end; + uint64 cpair, npair; + int64 nidx, eidx; // In ovl and align roles of A and B are reversed, as the B sequence must be the // complemented sequence !! @@ -1217,60 +1251,77 @@ static void *report_thread(void *arg) minhit = (Hitmin-1)/Kmer + 1; hitc = hitd + (minhit-1); - end = data->end - minhit; - h = data->beg; - for (p = hitd[h].p2; h < end; p = q) - if (hitc[h].p2 != p) - { h += 1; - while ((q = hitd[h].p2) == p) - h += 1; + eidx = data->end - minhit; + nidx = data->beg; + for (cpair = hitd[nidx].p2; nidx < eidx; cpair = npair) + if (hitc[nidx].p2 != cpair) + { nidx += 1; + while ((npair = hitd[nidx].p2) == cpair) + nidx += 1; } else - { int apos, bpos, diag; - int lasta, lastd; - int ar, br; + { int ar, br; int alen, blen; + int setaln, amark, amark2; + int apos, bpos, diag; + int64 lidx, sidx; + int64 f, h2; - ar = hits[h].aread; - br = hits[h].bread; + ar = hits[nidx].aread; + br = hits[nidx].bread; alen = aread[ar].rlen; blen = bread[br].rlen; if (alen < HGAP_MIN && blen < HGAP_MIN) - { do - q = hitd[++h].p2; - while (q == p); + { nidx += 1; + while ((npair = hitd[nidx].p2) == cpair) + nidx += 1; continue; } - - g = h; - do - { bpos = hits[h].bpos; - apos = hits[h].apos; - diag = (apos - bpos) >> Binshift; - if (apos - lastp[diag] >= Kmer) - score[diag] += Kmer; - else - score[diag] += apos - lastp[diag]; - lastp[diag] = apos; - q = hitd[++h].p2; - } - while (q == p); #ifdef TEST_GATHER - printf("%5d vs %5d : %3lld",br+bfirst,ar+afirst,h-g); + printf("%5d vs %5d : %5d x %5d\n",br+bfirst,ar+afirst,blen,alen); +#endif + setaln = 1; + amark2 = 0; + for (sidx = nidx; hitd[nidx].p2 == cpair; nidx = h2) + { amark = amark2 + PANEL_SIZE; + amark2 = amark - PANEL_OVERLAP; + + h2 = lidx = nidx; + do + { apos = hits[nidx].apos; + npair = hitd[++nidx].p2; + if (apos <= amark2) + h2 = nidx; + } + while (npair == cpair && apos <= amark); + + if (nidx-lidx < minhit) continue; + + for (f = lidx; f < nidx; f++) + { apos = hits[f].apos; + diag = hits[f].diag >> Binshift; + if (apos - lastp[diag] >= Kmer) + score[diag] += Kmer; + else + score[diag] += apos - lastp[diag]; + lastp[diag] = apos; + } + +#ifdef TEST_GATHER + printf(" %6lld upto %6d",nidx-lidx,amark); #endif - lasta = -1; - lastd = -(Kmer+1); - for (f = g; f < h; f++) - { bpos = hits[f].bpos; - apos = hits[f].apos; - diag = apos - bpos; - if ((lastd != diag && apos >= lasta) || (lastd == diag && apos > lasta+Kmer)) - { diag >>= Binshift; - if (score[diag] + score[diag+1] >= Hitmin || score[diag] + score[diag-1] >= Hitmin) - { if (lasta < 0) - { align->bseq = aseq + aread[ar].boff; + for (f = lidx; f < nidx; f++) + { apos = hits[f].apos; + diag = hits[f].diag; + bpos = apos - diag; + diag = diag >> Binshift; + if (apos > lasta[diag] && + (score[diag] + scorp[diag] >= Hitmin || score[diag] + scorm[diag] >= Hitmin)) + { if (setaln) + { setaln = 0; + align->bseq = aseq + aread[ar].boff; align->aseq = bseq + bread[br].boff; align->blen = alen; align->alen = blen; @@ -1280,21 +1331,35 @@ static void *report_thread(void *arg) #ifdef TEST_GATHER else printf("\n "); - if (score[diag-1] > score[diag+1]) + + if (scorm[diag] > scorp[diag]) printf(" %5d.. x %5d.. %5d (%3d)", - bpos,apos,apos-bpos,score[diag]+score[diag-1]); + bpos,apos,apos-bpos,score[diag]+scorm[diag]); else printf(" %5d.. x %5d.. %5d (%3d)", - bpos,apos,apos-bpos,score[diag]+score[diag+1]); + bpos,apos,apos-bpos,score[diag]+scorp[diag]); #endif nfilt += 1; -#ifdef TEST_HIT + apath = Local_Alignment(align,work,aspec,bpos-apos,bpos-apos,bpos+apos,-1,-1); - apath = Local_Alignment(align,work,aspec,bpos,bpos,apos); - lasta = bpath->bepos; - lastd = lasta - bpath->aepos; - if ((apath->aepos - apath->abpos) + (apath->bepos - apath->bbpos) >= MINOVER) + { int low, hgh, be; + + Diagonal_Span(bpath,tspace,&low,&hgh); + if (diag < low) + low = diag; + else if (diag > hgh) + hgh = diag; + be = bpath->bepos; + for (diag = low; diag <= hgh; diag++) + if (be > lasta[diag]) + lasta[diag] = be; +#ifdef TEST_GATHER + printf(" %d - %d @ %d",low,hgh,bpath->bepos); +#endif + } + + if ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos) >= MINOVER) { ovla->path = *apath; if (small) { Compress_TraceTo8(ovla); @@ -1331,26 +1396,34 @@ static void *report_thread(void *arg) #ifdef TEST_GATHER else printf(" No alignment %d", - ((apath->aepos - apath->abpos) + (apath->bepos - apath->bbpos))/2); + ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos))/2); #endif - -#else // TEST_HIT - - break; - -#endif // TEST_HIT } } - } + + for (f = lidx; f < nidx; f++) + { diag = hits[f].diag >> Binshift; + score[diag] = lastp[diag] = 0; + } #ifdef TEST_GATHER - printf("\n"); + printf("\n"); #endif + } + + for (f = sidx; f < nidx; f++) + { int d; - for (; g < h; g++) - { bpos = hits[g].bpos; - apos = hits[g].apos; - diag = (apos - bpos) >> Binshift; - score[diag] = lastp[diag] = 0; + diag = hits[f].diag >> Binshift; + for (d = diag; d <= maxdiag; d++) + if (lasta[d] == 0) + break; + else + lasta[d] = 0; + for (d = diag-1; d >= mindiag; d--) + if (lasta[d] == 0) + break; + else + lasta[d] = 0; } } @@ -1448,9 +1521,9 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, if (VERBOSE) { if (comp) - printf("\nComparing %s to %s\n",aname,bname); - else printf("\nComparing c(%s) to %s\n",aname,bname); + else + printf("\nComparing %s to %s\n",aname,bname); } if (alen == 0 || blen == 0) @@ -1625,7 +1698,7 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, khit[nhits].aread = 0x7fffffff; khit[nhits].bread = 0x7fffffff; - khit[nhits].bpos = 0x7fffffff; + khit[nhits].diag = 0x7fffffff; khit[nhits].apos = 0; #ifdef TEST_CSORT @@ -1660,19 +1733,20 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, } parmr[NTHREADS-1].end = nhits; - w = ((ablock->maxlen >> Binshift) - (-bblock->maxlen >> Binshift)) + 1; - counters = (int *) Malloc(NTHREADS*4*w*sizeof(int),"Allocating diagonal buckets"); + w = ((ablock->maxlen >> Binshift) - ((-bblock->maxlen) >> Binshift)) + 1; + counters = (int *) Malloc(NTHREADS*3*w*sizeof(int),"Allocating diagonal buckets"); if (counters == NULL) exit (1); - for (i = 0; i < 4*w*NTHREADS; i++) + for (i = 0; i < 3*w*NTHREADS; i++) counters[i] = 0; for (i = 0; i < NTHREADS; i++) { if (i == 0) parmr[i].score = counters - ((-bblock->maxlen) >> Binshift); else - parmr[i].score = parmr[i-1].lastp + w; + parmr[i].score = parmr[i-1].lasta + w; parmr[i].lastp = parmr[i].score + w; + parmr[i].lasta = parmr[i].lastp + w; parmr[i].work = New_Work_Data(); parmr[i].aspec = aspec;