RNAlib-2.4.5
|
|
Functions dealing with file formats for RNA sequences, structures, and alignments. More...
Functions dealing with file formats for RNA sequences, structures, and alignments.
Files | |
file | commands.h |
Parse and apply different commands that alter the behavior of secondary structure prediction and evaluation. | |
file | file_formats.h |
Read and write different file formats for RNA sequences, structures. | |
file | file_formats_msa.h |
Functions dealing with file formats for Multiple Sequence Alignments (MSA) | |
file | file_utils.h |
Several utilities for file handling. | |
file | ribo.h |
Parse RiboSum Scoring Matrices for Covariance Scoring of Alignments. | |
Data Structures | |
struct | vrna_command_s |
List element for commands ready for application to a vrna_fold_compound_t. More... | |
Macros | |
#define | VRNA_CMD_PARSE_HC 1U |
Command parse/apply flag indicating hard constraints. More... | |
#define | VRNA_CMD_PARSE_SC 2U |
Command parse/apply flag indicating soft constraints. More... | |
#define | VRNA_CMD_PARSE_UD 4U |
Command parse/apply flag indicating unstructured domains. More... | |
#define | VRNA_CMD_PARSE_SD 8U |
Command parse/apply flag indicating structured domains. More... | |
#define | VRNA_CMD_PARSE_DEFAULTS |
Command parse/apply flag indicating default set of commands. More... | |
#define | VRNA_OPTION_MULTILINE 32U |
Tell a function that an input is assumed to span several lines. More... | |
#define | VRNA_CONSTRAINT_MULTILINE 32U |
parse multiline constraint More... | |
#define | VRNA_FILE_FORMAT_MSA_CLUSTAL 1U |
Option flag indicating ClustalW formatted files. More... | |
#define | VRNA_FILE_FORMAT_MSA_STOCKHOLM 2U |
Option flag indicating Stockholm 1.0 formatted files. More... | |
#define | VRNA_FILE_FORMAT_MSA_FASTA 4U |
Option flag indicating FASTA (Pearson) formatted files. More... | |
#define | VRNA_FILE_FORMAT_MSA_MAF 8U |
Option flag indicating MAF formatted files. More... | |
#define | VRNA_FILE_FORMAT_MSA_MIS 16U |
Option flag indicating most informative sequence (MIS) output. More... | |
#define | VRNA_FILE_FORMAT_MSA_DEFAULT |
Option flag indicating the set of default file formats. More... | |
#define | VRNA_FILE_FORMAT_MSA_NOCHECK 4096U |
Option flag to disable validation of the alignment. More... | |
#define | VRNA_FILE_FORMAT_MSA_UNKNOWN 8192U |
Return flag of vrna_file_msa_detect_format() to indicate unknown or malformatted alignment. More... | |
#define | VRNA_FILE_FORMAT_MSA_APPEND 16384U |
Option flag indicating to append data to a multiple sequence alignment file rather than overwriting it. More... | |
#define | VRNA_FILE_FORMAT_MSA_QUIET 32768U |
Option flag to suppress unnecessary spam messages on stderr More... | |
#define | VRNA_FILE_FORMAT_MSA_SILENT 65536U |
Option flag to completely silence any warnings on stderr More... | |
Typedefs | |
typedef struct vrna_command_s | vrna_cmd_t |
Typename for the command representing data structure vrna_command_s. | |
Enumerations | |
enum | vrna_command_e |
Types of commands within a list of vrna_command_s structures. | |
Functions | |
vrna_cmd_t * | vrna_file_commands_read (const char *filename, unsigned int options) |
Extract a list of commands from a command file. More... | |
int | vrna_file_commands_apply (vrna_fold_compound_t *vc, const char *filename, unsigned int options) |
Apply a list of commands from a command file. More... | |
int | vrna_commands_apply (vrna_fold_compound_t *vc, vrna_cmd_t *commands, unsigned int options) |
Apply a list of commands to a vrna_fold_compound_t. More... | |
void | vrna_commands_free (vrna_cmd_t *commands) |
Free memory occupied by a list of commands. More... | |
void | vrna_file_helixlist (const char *seq, const char *db, float energy, FILE *file) |
Print a secondary structure as helix list. More... | |
void | vrna_file_connect (const char *seq, const char *db, float energy, const char *identifier, FILE *file) |
Print a secondary structure as connect table. More... | |
void | vrna_file_bpseq (const char *seq, const char *db, FILE *file) |
Print a secondary structure in bpseq format. More... | |
void | vrna_file_json (const char *seq, const char *db, double energy, const char *identifier, FILE *file) |
Print a secondary structure in JSON format. More... | |
unsigned int | vrna_file_fasta_read_record (char **header, char **sequence, char ***rest, FILE *file, unsigned int options) |
Get a (fasta) data set from a file or stdin. More... | |
char * | vrna_extract_record_rest_structure (const char **lines, unsigned int length, unsigned int option) |
Extract a dot-bracket structure string from a (multiline) character array. More... | |
int | vrna_file_SHAPE_read (const char *file_name, int length, double default_value, char *sequence, double *values) |
Read data from a given SHAPE reactivity input file. More... | |
void | vrna_extract_record_rest_constraint (char **cstruc, const char **lines, unsigned int option) |
Extract a hard constraint encoded as pseudo dot-bracket string. More... | |
unsigned int | read_record (char **header, char **sequence, char ***rest, unsigned int options) |
Get a data record from stdin. More... | |
int | vrna_file_msa_read (const char *filename, char ***names, char ***aln, char **id, char **structure, unsigned int options) |
Read a multiple sequence alignment from file. More... | |
int | vrna_file_msa_read_record (FILE *fp, char ***names, char ***aln, char **id, char **structure, unsigned int options) |
Read a multiple sequence alignment from file handle. More... | |
unsigned int | vrna_file_msa_detect_format (const char *filename, unsigned int options) |
Detect the format of a multiple sequence alignment file. More... | |
int | vrna_file_msa_write (const char *filename, const char **names, const char **aln, const char *id, const char *structure, const char *source, unsigned int options) |
Write multiple sequence alignment file. More... | |
void | vrna_file_copy (FILE *from, FILE *to) |
Inefficient `cp'. | |
char * | vrna_read_line (FILE *fp) |
Read a line of arbitrary length from a stream. More... | |
int | vrna_mkdir_p (const char *path) |
Recursively create a directory tree. | |
char * | vrna_basename (const char *path) |
Extract the filename from a file path. | |
char * | vrna_dirname (const char *path) |
Extract the directory part of a file path. | |
char * | vrna_filename_sanitize (const char *name, const char *replacement) |
Sanitize a file name. More... | |
float ** | readribosum (char *name) |
Read a RiboSum or other user-defined Scoring Matrix and Store into global Memory. | |
struct vrna_command_s |
List element for commands ready for application to a vrna_fold_compound_t.
#define VRNA_CMD_PARSE_HC 1U |
#include <ViennaRNA/commands.h>
Command parse/apply flag indicating hard constraints.
#define VRNA_CMD_PARSE_SC 2U |
#include <ViennaRNA/commands.h>
Command parse/apply flag indicating soft constraints.
#define VRNA_CMD_PARSE_UD 4U |
#include <ViennaRNA/commands.h>
Command parse/apply flag indicating unstructured domains.
#define VRNA_CMD_PARSE_SD 8U |
#include <ViennaRNA/commands.h>
Command parse/apply flag indicating structured domains.
#define VRNA_CMD_PARSE_DEFAULTS |
#include <ViennaRNA/commands.h>
Command parse/apply flag indicating default set of commands.
#define VRNA_OPTION_MULTILINE 32U |
#include <ViennaRNA/file_formats.h>
Tell a function that an input is assumed to span several lines.
If used as input-option a function might also be returning this state telling that it has read data from multiple lines.
#define VRNA_CONSTRAINT_MULTILINE 32U |
#include <ViennaRNA/file_formats.h>
parse multiline constraint
#define VRNA_FILE_FORMAT_MSA_CLUSTAL 1U |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating ClustalW formatted files.
#define VRNA_FILE_FORMAT_MSA_STOCKHOLM 2U |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating Stockholm 1.0 formatted files.
#define VRNA_FILE_FORMAT_MSA_FASTA 4U |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating FASTA (Pearson) formatted files.
#define VRNA_FILE_FORMAT_MSA_MAF 8U |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating MAF formatted files.
#define VRNA_FILE_FORMAT_MSA_MIS 16U |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating most informative sequence (MIS) output.
The default reference sequence output for an alignment is simply a consensus sequence. This flag allows writing the most informative sequence (MIS) instead.
#define VRNA_FILE_FORMAT_MSA_DEFAULT |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating the set of default file formats.
#define VRNA_FILE_FORMAT_MSA_NOCHECK 4096U |
#include <ViennaRNA/file_formats_msa.h>
Option flag to disable validation of the alignment.
#define VRNA_FILE_FORMAT_MSA_UNKNOWN 8192U |
#include <ViennaRNA/file_formats_msa.h>
Return flag of vrna_file_msa_detect_format() to indicate unknown or malformatted alignment.
#define VRNA_FILE_FORMAT_MSA_APPEND 16384U |
#include <ViennaRNA/file_formats_msa.h>
Option flag indicating to append data to a multiple sequence alignment file rather than overwriting it.
#define VRNA_FILE_FORMAT_MSA_QUIET 32768U |
#include <ViennaRNA/file_formats_msa.h>
Option flag to suppress unnecessary spam messages on stderr
#define VRNA_FILE_FORMAT_MSA_SILENT 65536U |
#include <ViennaRNA/file_formats_msa.h>
Option flag to completely silence any warnings on stderr
vrna_cmd_t* vrna_file_commands_read | ( | const char * | filename, |
unsigned int | options | ||
) |
#include <ViennaRNA/commands.h>
Extract a list of commands from a command file.
Read a list of commands specified in the input file and return them as list of abstract commands
filename | The filename |
options | Options to limit the type of commands read from the file |
int vrna_file_commands_apply | ( | vrna_fold_compound_t * | vc, |
const char * | filename, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/commands.h>
Apply a list of commands from a command file.
This function is a shortcut to directly parse a commands file and apply all successfully parsed commands to a vrna_fold_compound_t data structure. It is the same as:
vc | The vrna_fold_compound_t the command list will be applied to |
filename | The filename |
options | Options to limit the type of commands read from the file |
int vrna_commands_apply | ( | vrna_fold_compound_t * | vc, |
vrna_cmd_t * | commands, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/commands.h>
Apply a list of commands to a vrna_fold_compound_t.
vc | The vrna_fold_compound_t the command list will be applied to |
commands | The list of commands to apply |
options | Options to limit the type of commands read from the file |
void vrna_commands_free | ( | vrna_cmd_t * | commands | ) |
#include <ViennaRNA/commands.h>
Free memory occupied by a list of commands.
Release memory occupied by a list of commands
commands | A pointer to a list of commands |
void vrna_file_helixlist | ( | const char * | seq, |
const char * | db, | ||
float | energy, | ||
FILE * | file | ||
) |
#include <ViennaRNA/file_formats.h>
Print a secondary structure as helix list.
seq | The RNA sequence |
db | The structure in dot-bracket format |
energy | Free energy of the structure in kcal/mol |
file | The file handle used to print to (print defaults to 'stdout' if(file == NULL) ) |
void vrna_file_connect | ( | const char * | seq, |
const char * | db, | ||
float | energy, | ||
const char * | identifier, | ||
FILE * | file | ||
) |
#include <ViennaRNA/file_formats.h>
Print a secondary structure as connect table.
Connect table file format looks like this:
300  ENERGY = 7.0  example
  1 G       0    2   22    1
  2 G       1    3   21    2
where the headerline is followed by 6 columns with:
seq | The RNA sequence |
db | The structure in dot-bracket format |
energy | The free energy of the structure |
identifier | An optional identifier for the sequence |
file | The file handle used to print to (print defaults to 'stdout' if(file == NULL) ) |
void vrna_file_bpseq | ( | const char * | seq, |
const char * | db, | ||
FILE * | file | ||
) |
#include <ViennaRNA/file_formats.h>
Print a secondary structure in bpseq format.
seq | The RNA sequence |
db | The structure in dot-bracket format |
file | The file handle used to print to (print defaults to 'stdout' if(file == NULL) ) |
void vrna_file_json | ( | const char * | seq, |
const char * | db, | ||
double | energy, | ||
const char * | identifier, | ||
FILE * | file | ||
) |
#include <ViennaRNA/file_formats.h>
Print a secondary structure in JSON format.
seq | The RNA sequence |
db | The structure in dot-bracket format |
energy | The free energy |
identifier | An identifier for the sequence |
file | The file handle used to print to (print defaults to 'stdout' if(file == NULL) ) |
unsigned int vrna_file_fasta_read_record | ( | char ** | header, |
char ** | sequence, | ||
char *** | rest, | ||
FILE * | file, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/file_formats.h>
Get a (fasta) data set from a file or stdin.
This function may be used to obtain complete datasets from a filehandle or stdin. A dataset is always defined to contain at least a sequence. If data starts with a fasta header, i.e. a line like
>some header info
then vrna_file_fasta_read_record() will assume that the sequence that follows the header may span over several lines. To disable this behavior and to assign a single line to the argument 'sequence' one can pass VRNA_INPUT_NO_SPAN in the 'options' argument. If no fasta header is read in the beginning of a data block, a sequence must not span over multiple lines!
Unless the options VRNA_INPUT_NOSKIP_COMMENTS or VRNA_INPUT_NOSKIP_BLANK_LINES are passed, a sequence may be interrupted by lines starting with a comment character or empty lines.
A sequence is regarded as completely read if it was either assumed to not span over multiple lines, a secondary structure or structure constraint follows the sequence on the next line, or a new header marks the beginning of a new sequence...
All lines following the sequence (this includes comments) that do not initiate a new dataset according to the above definition are available through the line-array 'rest'. Here one can usually find the structure constraint or other information belonging to the current dataset. Filling of 'rest' may be prevented by passing VRNA_INPUT_NO_REST to the options argument.
The main purpose of this function is to be able to easily parse blocks of data in the header of a loop where all calculations for the appropriate data are done inside the loop. The loop may then be left on certain return values, e.g.:
In the example above, the while loop will be terminated when vrna_file_fasta_read_record() returns either an error, EOF, or a user initiated quit request.
As long as data is read from stdin (we are passing NULL as the file pointer), the id is printed if it is available for the current block of data. The sequence will be printed in any case and if some more lines belong to the current block of data each line will be printed as well.
header | A pointer which will be set such that it points to the header of the record |
sequence | A pointer which will be set such that it points to the sequence of the record |
rest | A pointer which will be set such that it points to an array of lines which also belong to the record |
file | A file handle to read from (if NULL, this function reads from stdin) |
options | Some options which may be passed to alter the behavior of the function, use 0 for no options |
char* vrna_extract_record_rest_structure | ( | const char ** | lines, |
unsigned int | length, | ||
unsigned int | option | ||
) |
#include <ViennaRNA/file_formats.h>
Extract a dot-bracket structure string from a (multiline) character array.
This function extracts a dot-bracket structure string from the 'rest' array as returned by vrna_file_fasta_read_record() and returns it. All occurrences of comments within the 'lines' array will be skipped as long as they do not break the structure string. If no structure could be read, this function returns NULL.
lines | The (multiline) character array to be parsed |
length | The assumed length of the dot-bracket string (passing a value < 1 results in no length limit) |
option | Some options which may be passed to alter the behavior of the function, use 0 for no options |
int vrna_file_SHAPE_read | ( | const char * | file_name, |
int | length, | ||
double | default_value, | ||
char * | sequence, | ||
double * | values | ||
) |
#include <ViennaRNA/file_formats.h>
Read data from a given SHAPE reactivity input file.
This function parses the information from a given file and stores the result in the preallocated string sequence and the double array values.
file_name | Path to the constraints file |
length | Length of the sequence (file entries exceeding this limit will cause an error) |
default_value | Value for missing indices |
sequence | Pointer to an array used for storing the sequence obtained from the SHAPE reactivity file |
values | Pointer to an array used for storing the values obtained from the SHAPE reactivity file |
void vrna_extract_record_rest_constraint | ( | char ** | cstruc, |
const char ** | lines, | ||
unsigned int | option | ||
) |
#include <ViennaRNA/file_formats.h>
Extract a hard constraint encoded as pseudo dot-bracket string.
cstruc | A pointer to a character array that is used as pseudo dot-bracket output |
lines | A 2-dimensional character array with the extension lines from the FASTA input |
option | The option flags that define the behavior and recognition pattern of this function |
unsigned int read_record | ( | char ** | header, |
char ** | sequence, | ||
char *** | rest, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/file_formats.h>
Get a data record from stdin.
int vrna_file_msa_read | ( | const char * | filename, |
char *** | names, | ||
char *** | aln, | ||
char ** | id, | ||
char ** | structure, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/file_formats_msa.h>
Read a multiple sequence alignment from file.
This function reads the (first) multiple sequence alignment from an input file. The read alignment is split into the sequence id/name part and the actual sequence information and stored in memory as arrays of ids/names and sequences. If the alignment file format allows for additional information, such as an ID of the entire alignment or consensus structure information, this data is retrieved as well and made available. The options parameter allows one to specify the set of alignment file formats that should be used to retrieve the data. If 0 is passed as option, the list of alignment file formats defaults to VRNA_FILE_FORMAT_MSA_DEFAULT.
Currently, the list of parsable multiple sequence alignment file formats consists of:
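Note: Validation of the alignment can be deactivated by passing VRNA_FILE_FORMAT_MSA_NOCHECK in the options parameter.
Note: It is the caller's responsibility to free any memory occupied by the output arguments names, aln, id, and structure after calling this function. The function automatically sets the latter two arguments to NULL in case no corresponding data could be retrieved from the input alignment.
filename | The name of input file that contains the alignment |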
names | An address to the pointer where sequence identifiers should be written to |
aln | An address to the pointer where aligned sequences should be written to |
id | An address to the pointer where the alignment ID should be written to (Maybe NULL) |
structure | An address to the pointer where consensus structure information should be written to (Maybe NULL) |
options | Options to manipulate the behavior of this function |
In the target scripting language, only the first and last argument, filename and options, are passed to the corresponding function. The other arguments, which serve as output in the C-library, are available as additional return values. Hence, a function call in Python may look like this:
After successfully reading the first record, the variable num_seq contains the number of sequences in the alignment (the actual return value of the C-function), while the variables names, aln, id, and structure are lists of the sequence names and aligned sequences, as well as strings holding the alignment ID and the structure as stated in the SS_cons line, respectively. Note that the last two return values may be empty strings in case the alignment does not provide the required data.
This function exists as an overloaded version where the options parameter may be omitted! In that case, the options parameter defaults to VRNA_FILE_FORMAT_MSA_STOCKHOLM.
int vrna_file_msa_read_record | ( | FILE * | fp, |
char *** | names, | ||
char *** | aln, | ||
char ** | id, | ||
char ** | structure, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/file_formats_msa.h>
Read a multiple sequence alignment from file handle.
Similar to vrna_file_msa_read(), this function reads a multiple sequence alignment from an input file handle. Since using a file handle, this function is not limited to the first alignment record, but allows for looping over all alignments within the input.
The read alignment is split into the sequence id/name part and the actual sequence information and stored in memory as arrays of ids/names and sequences. If the alignment file format allows for additional information, such as an ID of the entire alignment or consensus structure information, this data is retrieved as well and made available. The options parameter allows one to specify the alignment file format used to retrieve the data. A single format must be specified here; see vrna_file_msa_detect_format() for help in determining the correct MSA file format.
Currently, the list of parsable multiple sequence alignment file formats consists of:
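Note: Validation of the alignment can be deactivated by passing VRNA_FILE_FORMAT_MSA_NOCHECK in the options parameter.
Note: It is the caller's responsibility to free any memory occupied by the output arguments names, aln, id, and structure after calling this function. The function automatically sets the latter two arguments to NULL in case no corresponding data could be retrieved from the input alignment.
fp | The file pointer the data will be retrieved from |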
names | An address to the pointer where sequence identifiers should be written to |
aln | An address to the pointer where aligned sequences should be written to |
id | An address to the pointer where the alignment ID should be written to (Maybe NULL) |
structure | An address to the pointer where consensus structure information should be written to (Maybe NULL) |
options | Options to manipulate the behavior of this function |
In the target scripting language, only the first and last argument, fp and options, are passed to the corresponding function. The other arguments, which serve as output in the C-library, are available as additional return values. Hence, a function call in Python may look like this:
After successfully reading the first record, the variable num_seq contains the number of sequences in the alignment (the actual return value of the C-function), while the variables names, aln, id, and structure are lists of the sequence names and aligned sequences, as well as strings holding the alignment ID and the structure as stated in the SS_cons line, respectively. Note that the last two return values may be empty strings in case the alignment does not provide the required data.
This function exists as an overloaded version where the options parameter may be omitted! In that case, the options parameter defaults to VRNA_FILE_FORMAT_MSA_STOCKHOLM.
unsigned int vrna_file_msa_detect_format | ( | const char * | fn, |
unsigned int | options | ||
) |
#include <ViennaRNA/file_formats_msa.h>
Detect the format of a multiple sequence alignment file.
This function attempts to determine the format of a file that supposedly contains a multiple sequence alignment (MSA). This is useful in cases where a MSA file contains more than a single record and therefore vrna_file_msa_read() can not be applied, since it only retrieves the first. Here, one can try to guess the correct file format using this function and then loop over the file, record by record using one of the low-level record retrieval functions for the corresponding MSA file format.
filename | The name of input file that contains the alignment |
options | Options to manipulate the behavior of this function |
This function exists as an overloaded version where the options parameter may be omitted! In that case, the options parameter defaults to VRNA_FILE_FORMAT_MSA_DEFAULT.
int vrna_file_msa_write | ( | const char * | filename, |
const char ** | names, | ||
const char ** | aln, | ||
const char * | id, | ||
const char * | structure, | ||
const char * | source, | ||
unsigned int | options | ||
) |
#include <ViennaRNA/file_formats_msa.h>
Write multiple sequence alignment file.
filename | The output filename |
names | The array of sequence names / identifiers |
aln | The array of aligned sequences |
id | An optional ID for the alignment |
structure | An optional consensus structure |
source | A string describing the source of the alignment |
options | Options to manipulate the behavior of this function |
If the options parameter is missing, the options default to (VRNA_FILE_FORMAT_MSA_STOCKHOLM | VRNA_FILE_FORMAT_MSA_APPEND).
char* vrna_read_line | ( | FILE * | fp | ) |
#include <ViennaRNA/file_utils.h>
Read a line of arbitrary length from a stream.
Returns a pointer to the resulting string. The necessary memory is allocated and should be released using free() when the string is no longer needed.
fp | A file pointer to the stream where the function should read from |
char* vrna_filename_sanitize | ( | const char * | name, |
const char * | replacement | ||
) |
#include <ViennaRNA/file_utils.h>
Sanitize a file name.
Returns a new file name where all invalid characters are substituted by a replacement character. If no replacement character is supplied, invalid characters are simply removed from the filename. File names may also never exceed a length of 255 characters. Longer file names will undergo a 'smart' truncation process, where the filename's suffix, i.e. everything after the last dot '.', is attempted to be kept intact. Hence, only the filename part before the suffix is reduced in such a way that the total filename complies with the length restriction of 255 characters. If no suffix is present or the suffix itself already exceeds the maximum length, the filename is simply truncated from the back of the string.
For now we consider the following characters invalid:
Furthermore, the (resulting) file name must not be a reserved file name, such as:
name | The input file name |
replacement | The replacement character, or NULL |