/* /usr/include/biosquid/msa.h is in biosquid-dev 1.9g+cvs20050121-4.
 * This file is owned by root:root, with mode 0o644.
 * The actual contents of the file can be viewed below.
 */
/*****************************************************************
* @LICENSE@
*****************************************************************/
#ifndef SQUID_MSA_INCLUDED
#define SQUID_MSA_INCLUDED
/* msa.h
* SRE, Mon May 17 10:24:30 1999
*
* Header file for SQUID's multiple sequence alignment
* manipulation code.
*
* RCS $Id: msa.h,v 1.14 2004/02/04 14:44:03 eddy Exp $
*/
#include "squidconf.h"
#include <stdio.h> /* FILE support */
#include "gki.h" /* hash table support */
#include "ssi.h" /* sequence file index support */
#include "squid.h" /* need SQINFO */
/****************************************************
* Obsolete alignment information, AINFO
* Superseded by MSA structure further below; but we
* need AINFO for the near future for backwards
* compatibility.
****************************************************/
/* Structure: aliinfo_s
*
* Purpose: Optional information returned from an alignment file.
*
* flags: always used. Flags for which info is valid/alloced.
*
* alen: mandatory. Alignments are always flushed right
* with gaps so that all aseqs are the same length, alen.
* Available for all alignment formats.
*
* nseq: mandatory. Aligned seqs are indexed 0..nseq-1.
*
* wgt: 0..nseq-1 vector of sequence weights. Mandatory.
* If not explicitly set, weights are initialized to 1.0.
*
* cs: 0..alen-1, just like the alignment. Contains single-letter
* secondary structure codes for consensus structure; "<>^+"
* for RNA, "EHL." for protein. May be NULL if unavailable
* from seqfile. Only available for SELEX format files.
*
* rf: 0..alen-1, just like the alignment. rf is an arbitrary string
* of characters, used for annotating columns. Blanks are
* interpreted as non-canonical columns and anything else is
* considered canonical. Only available from SELEX files.
*
* sqinfo: mandatory. Array of 0..nseq-1
* per-sequence information structures, carrying
* name, id, accession, coords.
*
*/
/* Legacy (AINFO) alignment annotation record.
 * Kept for backwards compatibility alongside the newer MSA structure.
 */
struct aliinfo_s {
  /* Which optional information is valid.
   * NOTE(review): presumably a mask of the AINFO_* bits defined below -- confirm.
   */
  int flags;

  /* Alignment dimensions. */
  int alen;                     /* number of aligned columns */
  int nseq;                     /* number of sequences in the alignment */

  /* Annotation attached to the alignment. */
  float *wgt;                   /* per-sequence weights, [0..nseq-1] */
  char *cs;                     /* consensus secondary structure string */
  char *rf;                     /* reference coordinate system */
  struct seqinfo_s *sqinfo;     /* per-sequence name, id, coord info */

  /* Pfam/HMMER pick-ups */
  char *name;                   /* alignment name */
  char *desc;                   /* alignment description */
  char *acc;                    /* alignment accession */
  char *au;                     /* "author" information */

  /* Score cutoffs; in each pair, 1 is per-seq and 2 is per-domain
   * for the trusted and noise cutoffs.
   */
  float tc1;                    /* trusted score cutoff, per-seq */
  float tc2;                    /* trusted score cutoff, per-domain */
  float nc1;                    /* noise score cutoff, per-seq */
  float nc2;                    /* noise score cutoff, per-domain */
  float ga1;                    /* gathering cutoff (first) */
  float ga2;                    /* gathering cutoff (second) */
};
typedef struct aliinfo_s AINFO;

/* Single-bit flag values, one per cutoff family. */
#define AINFO_TC (1 << 0)
#define AINFO_NC (1 << 1)
#define AINFO_GA (1 << 2)
/*****************************************************************
* MSA
* SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland]
*
* Defines the new data structure and API for multiple
* sequence alignment i/o.
*****************************************************************/
/* The following constants define the Pfam/Rfam cutoff set we'll propagate
* from msa's into HMMER and Infernal models.
*/
/* Cutoff slot numbers 0..5, followed by the number of slots.
 * MSA_MAXCUTOFFS is the dimension of the cutoff[] and cutoff_is_set[]
 * arrays in the MSA structure.
 * NOTE(review): TC/GA/NC presumably mean trusted/gathering/noise cutoffs,
 * with 1 = per-seq and 2 = per-domain, mirroring the tc1..ga2 members of
 * AINFO -- confirm against the code that fills MSA.cutoff[].
 */
#define MSA_CUTOFF_TC1 0
#define MSA_CUTOFF_TC2 1
#define MSA_CUTOFF_GA1 2
#define MSA_CUTOFF_GA2 3
#define MSA_CUTOFF_NC1 4
#define MSA_CUTOFF_NC2 5
#define MSA_MAXCUTOFFS 6 /* count of the slots above */
/* Structure: MSA
* SRE, Tue May 18 11:33:08 1999
*
* Our object for a multiple sequence alignment.
*/
typedef struct msa_struct {
  /* ---- Mandatory information associated with the alignment ---- */
  char  **aseq;                 /* aligned sequences, [0..nseq-1][0..alen-1] */
  char  **sqname;               /* sequence names, [0..nseq-1][] */
  float  *wgt;                  /* sequence weights, [0..nseq-1] */
  int     alen;                 /* alignment length, in columns */
  int     nseq;                 /* number of sequences */

  /* ---- Optional information that we understand, and might have ---- */
  int     flags;                /* which optional info is valid */
  int     type;                 /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */
  char   *name;                 /* alignment name, or NULL */
  char   *desc;                 /* alignment description, or NULL */
  char   *acc;                  /* alignment accession, or NULL */
  char   *au;                   /* "author" information, or NULL */
  char   *ss_cons;              /* consensus secondary structure, or NULL */
  char   *sa_cons;              /* consensus surface accessibility, or NULL */
  char   *rf;                   /* reference coordinate system, or NULL */
  char  **sqacc;                /* per-sequence accession numbers */
  char  **sqdesc;               /* per-sequence description lines */
  char  **ss;                   /* per-seq secondary structure annotation, or NULL */
  char  **sa;                   /* per-seq surface accessibility annotation, or NULL */
  float   cutoff[MSA_MAXCUTOFFS];        /* NC, TC, GA cutoffs propagated to Pfam/Rfam */
  int     cutoff_is_set[MSA_MAXCUTOFFS]; /* TRUE if a cutoff is set; else FALSE */

  /* ---- Optional information that we don't understand ----
   * We know what kind of information it is, but it is either
   * (interpreted as) a free-text comment, or Stockholm markup
   * carrying tags we are not familiar with.
   */
  char  **comment;              /* free-text comment lines, or NULL */
  int     ncomment;             /* comment lines in use */
  int     alloc_ncomment;       /* comment lines allocated */

  char  **gf_tag;               /* tags of unparsed #=GF lines */
  char  **gf;                   /* annotation text of unparsed #=GF lines */
  int     ngf;                  /* unparsed #=GF lines in use */
  int     alloc_ngf;            /* #=GF lines allocated */

  char  **gs_tag;               /* tags of unparsed #=GS lines */
  char ***gs;                   /* [0..ngs-1][0..nseq-1][free text] markup */
  GKI    *gs_idx;               /* hash of #=GS tag types */
  int     ngs;                  /* number of #=GS tag types */

  char  **gc_tag;               /* tags of unparsed #=GC lines */
  char  **gc;                   /* [0..ngc-1][0..alen-1] markup */
  GKI    *gc_idx;               /* hash of #=GC tag types */
  int     ngc;                  /* number of #=GC tag types */

  char  **gr_tag;               /* tags of unparsed #=GR lines */
  char ***gr;                   /* [0..ngr-1][0..nseq-1][0..alen-1] markup
                                 * (first dimension presumably 0..ngr-1,
                                 * as for gs and gc -- confirm) */
  GKI    *gr_idx;               /* hash of #=GR tag types */
  int     ngr;                  /* number of #=GR tag types */

  /* ---- Bookkeeping for our own maintenance of the structure ---- */
  GKI    *index;                /* hash table mapping name -> seqidx */
  int     nseqalloc;            /* sequences currently allocated for */
  int     nseqlump;             /* growth increment when nseq expands */
  int    *sqlen;                /* per-sequence lengths while parsing */
  int    *sslen;                /* per-sequence ss lengths while parsing */
  int    *salen;                /* per-sequence sa lengths while parsing */
  int     lastidx;              /* last index seen; used to guess the next */
} MSA;

/* Flag bit: tracks whether weights were set, or left at default 1.0. */
#define MSA_SET_WGT (1 << 0)
/* Structure: MSAFILE
* SRE, Tue May 18 11:36:54 1999
*
* Defines an alignment file that's open for reading.
*/
typedef struct msafile_struct {
  /* The underlying stream and where we are in it. */
  FILE    *f;                   /* open file pointer */
  char    *fname;               /* file name; used for diagnostic output */
  int      linenumber;          /* line of the file we are currently on */

  /* Line input buffer, used with sre_fgets(). */
  char    *buf;                 /* the buffer itself */
  int      buflen;              /* length currently allocated for buf */

  SSIFILE *ssi;                 /* open SSI index file; NULL if none */

  /* How f was obtained, which decides how it must be released. */
  int      do_gzip;             /* TRUE if f is a pipe from gzip -dc (need pclose(f)) */
  int      do_stdin;            /* TRUE if f is stdin (don't close f, not our problem) */

  int      format;              /* format of the alignment file being read */
} MSAFILE;
/* Alignment file formats.
* Must coexist with sqio.c/squid.h unaligned file format codes.
* Rules:
* - 0 is an unknown/unassigned format
* - <100 reserved for unaligned formats
* - >100 reserved for aligned formats
*/
/* Format codes: 0 means unknown; the aligned formats are numbered from 101. */
#define MSAFILE_UNKNOWN 0 /* unknown format */
#define MSAFILE_STOCKHOLM 101 /* Pfam/HMMER's Stockholm format */
#define MSAFILE_SELEX 102 /* Obsolete(!): old HMMER/SELEX format */
#define MSAFILE_MSF 103 /* GCG MSF format */
#define MSAFILE_CLUSTAL 104 /* Clustal V/W format */
#define MSAFILE_A2M 105 /* aligned FASTA (A2M is UCSC terminology) */
#define MSAFILE_PHYLIP 106 /* Felsenstein's PHYLIP format */
#define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */
/* Nonzero when fmt is greater than 100, i.e. lies in the code range
 * reserved for aligned formats; zero otherwise (including for
 * MSAFILE_UNKNOWN). The argument is evaluated exactly once.
 */
#define IsAlignmentFormat(fmt) ((fmt) > 100)
/* from msa.c
 *
 * Prototypes only; the implementations are not in this header, so see
 * msa.c for each function's exact contract.
 *
 * What the signatures themselves show:
 *   - afp is an MSAFILE (an alignment file that is open for reading);
 *   - msa is an MSA (the multiple sequence alignment object);
 *   - fp is a plain stdio FILE used by the write routine.
 *
 * NOTE(review): the meaning of the int return values, the ownership of
 * returned char* and MSA* values, and whether `format`/`outfmt` must be
 * one of the MSAFILE_* codes are not visible here -- confirm in msa.c.
 * NOTE(review): seqidx/idx presumably select a sequence in 0..nseq-1,
 * except in MSAFilePositionByIndex where idx may instead refer to a
 * position within the file -- confirm.
 */
extern MSAFILE *MSAFileOpen(char *filename, int format, char *env);
extern MSA *MSAFileRead(MSAFILE *afp);
extern void MSAFileClose(MSAFILE *afp);
extern void MSAFree(MSA *msa);
extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline);
extern int MSAFileRewind(MSAFILE *afp);
extern int MSAFilePositionByKey(MSAFILE *afp, char *key);
extern int MSAFilePositionByIndex(MSAFILE *afp, int idx);
extern int MSAFileFormat(MSAFILE *afp);
extern MSA *MSAAlloc(int nseq, int alen);
extern void MSAExpand(MSA *msa);
extern char *MSAFileGetLine(MSAFILE *afp);
extern void MSASetSeqAccession(MSA *msa, int seqidx, char *acc);
extern void MSASetSeqDescription(MSA *msa, int seqidx, char *desc);
extern void MSAAddComment(MSA *msa, char *s);
extern void MSAAddGF(MSA *msa, char *tag, char *value);
extern void MSAAddGS(MSA *msa, char *tag, int seqidx, char *value);
extern void MSAAppendGC(MSA *msa, char *tag, char *value);
extern char *MSAGetGC(MSA *msa, char *tag);
extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value);
extern void MSAVerifyParse(MSA *msa);
extern int MSAGetSeqidx(MSA *msa, char *name, int guess);
extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo); /* takes the legacy aseq + AINFO pair */
extern void MSAMingap(MSA *msa);
extern void MSANogap(MSA *msa);
extern void MSAShorterAlignment(MSA *msa, int *useme);
extern void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new); /* second result is passed back via ret_new */
extern char *MSAGetSeqAccession(MSA *msa, int idx);
extern char *MSAGetSeqDescription(MSA *msa, int idx);
extern char *MSAGetSeqSS(MSA *msa, int idx);
extern char *MSAGetSeqSA(MSA *msa, int idx);
extern float MSAAverageSequenceLength(MSA *msa);
/* Per-format readers and writers.
 * Each Read* routine takes an MSAFILE open for reading and returns an
 * MSA pointer; each Write* routine takes a stdio FILE and an MSA.
 * NOTE(review): what the readers return at end of file or on a parse
 * error is not visible from this header -- confirm in each .c file.
 */
/* from a2m.c -- aligned FASTA (MSAFILE_A2M)
 */
extern MSA *ReadA2M(MSAFILE *afp);
extern void WriteA2M(FILE *fp, MSA *msa);
/* from clustal.c -- Clustal V/W format (MSAFILE_CLUSTAL)
 */
extern MSA *ReadClustal(MSAFILE *afp);
extern void WriteClustal(FILE *fp, MSA *msa);
/* from eps.c -- Encapsulated PostScript (MSAFILE_EPS); output only,
 * so there is a writer but no reader.
 */
extern void EPSWriteSmallMSA(FILE *fp, MSA *msa);
/* from msf.c -- GCG MSF format (MSAFILE_MSF)
 */
extern MSA *ReadMSF(MSAFILE *afp);
extern void WriteMSF(FILE *fp, MSA *msa);
/* from phylip.c -- Felsenstein's PHYLIP format (MSAFILE_PHYLIP)
 */
extern MSA *ReadPhylip(MSAFILE *afp);
extern void WritePhylip(FILE *fp, MSA *msa);
/* from selex.c -- obsolete old HMMER/SELEX format (MSAFILE_SELEX)
 */
extern MSA *ReadSELEX(MSAFILE *afp);
extern void WriteSELEX(FILE *fp, MSA *msa);
extern void WriteSELEXOneBlock(FILE *fp, MSA *msa);
/* from stockholm.c -- Pfam/HMMER's Stockholm format (MSAFILE_STOCKHOLM)
 */
extern MSA *ReadStockholm(MSAFILE *afp);
extern void WriteStockholm(FILE *fp, MSA *msa);
extern void WriteStockholmOneBlock(FILE *fp, MSA *msa);
#endif /*SQUID_MSA_INCLUDED*/