Generated on Wed Apr 29 2015 11:51:40 for GGL-4.1.2 by doxygen 1.8.3.1
SMILES_grammar.hh
Go to the documentation of this file.
1 
2 #ifndef GGL_CHEM_SMILES_GRAMMAR_HH_
3 #define GGL_CHEM_SMILES_GRAMMAR_HH_
4 
5 #include "sgm/HashMap.hh"
6 #if HAVE_UNORDERED_MAP > 0
7  #include <unordered_map>
8 #elif HAVE_TR1_UNORDERED_MAP > 0
9  #include <tr1/unordered_map>
10 #elif HAVE_GNU_HASH_MAP > 0
11  #include <ext/hash_map>
12 #else
13  #include <map>
14 #endif
15 
16 #include <utility>
17 #include <vector>
18 #include <locale>
19 
20 
21  // set spirit closure limit if necessary
22 #if !defined(BOOST_SPIRIT_CLOSURE_LIMIT)
23 #define BOOST_SPIRIT_CLOSURE_LIMIT 5
24 #elif BOOST_SPIRIT_CLOSURE_LIMIT < 5
25 #error "GGL_CHEM_SMILES_GRAMMAR : BOOST_SPIRIT_CLOSURE_LIMIT too low, has to be at least 5"
26 #endif
27 
28  // set phoenix limit if necessary
29 #if !defined(PHOENIX_LIMIT)
30 #define PHOENIX_LIMIT 5
31 #elif PHOENIX_LIMIT < 5
32 #error "GGL_CHEM_SMILES_GRAMMAR : PHOENIX_LIMIT too low, has to be at least 5"
33 #endif
34 
35 #include <boost/version.hpp>
36 #if BOOST_VERSION >= 103800
37 #include <boost/spirit/include/classic.hpp>
38 #include <boost/spirit/include/phoenix1.hpp>
39 #define NS_BOOSTSPIRIT boost::spirit::classic
40 #else
41 #include <boost/spirit.hpp>
42 #include <boost/spirit/phoenix.hpp>
43 #define NS_BOOSTSPIRIT boost::spirit
44 #endif
45 
46 #include "ggl/chem/Molecule.hh"
47 #include "ggl/chem/MoleculeUtil.hh"
48 
49 namespace ggl {
50  namespace chem {
51 
52 
53  /*! @brief SMILES molecule parser
54  *
55  * This class defines the rules of the Daylight's (tm) SMILES
56  * BNF grammar. It allows for the parsing of a SMILES string to generate
57  * a molecule graph of the encoded molecule. The graph is represented as
58  * a boost graph (Molecule) and the atom and bond labels will be
59  * stored in the property_maps of the given PropNodeLabel and
60  * PropEdgeLabel.
61  *
62  * BNF grammar of Daylight's SMILES
63  *
64  * smiles ::= chain (chain | branch)*
65  * chain ::= bond? (simple_atom | complex_atom) ringclosure*
66  * branch ::= '(' chain (chain | branch)* ')'
67  * ringclosure ::= digit | ('%' digit digit)
68  * simple_atom ::= simple_symbol
69  * complex_atom ::= '[' isotope? (simple_symbol | complex_symbol | group_symbol) chirality? hcount? charge? name? ']'
70  * isotope ::= integer
71  * simple_symbol ::= 'Br' | 'Cl' | 'B' | 'c' | etc.
72  * complex_symbol ::= 's' | 'p' | 'o' | 'Zn' | etc.
73  * group_symbol ::= '{' anyChar+ '}'
74  * chirality ::= '@' '@'?
75  * hcount ::= 'H' integer?
76  * charge ::= '+' ('+'* | integer)
77  * | '-' ('-'* | integer)
78  * name ::= ':' integer
79  * bond ::= bond_symbol
80  * bond_symbol ::= '-' | '=' | '#' | ':' | etc.
81  * integer ::= digit+
82  * digit ::= [1-9]
83  *
84  * NOTE : chirality and isotope information are currently ignored !
85  *
86  * NOTE : Supported atom labels are defined by MoleculeUtil::getAtomData().
87  *
88  * NOTE : Supported bond labels are defined by MoleculeUtil::getBondData().
89  *
90  * NOTE further : we allow for an extension of the SMILES encoding:
91  * complex atoms are allowed to hold group_symbol ID strings
92  * enclosed in curly braces of the form '{SOMEID}'. They are replaced by
93  * the corresponding group subgraphs if found in the provided group map.
94  * Otherwise the parsing is aborted.
95  *
96  * @author Christoph Flamm (c) 2008 http://www.tbi.univie.ac.at/~xtof/
97  * @author Martin Mann (c) 2008 http://www.bioinf.uni-freiburg.de/~mmann/
98  */
100  : public NS_BOOSTSPIRIT::grammar< SMILES_grammar >
101  {
102  protected:
103 
104  //! The boost graph object that is filled to represent the next parsed
105  //! SMILES string.
107 
108  //! Container that holds group IDs where each matching node has to be
109  //! replaced by the according mapped subgraph
110  const GroupMap * groups;
111 
112  //! Access to the node label property_map of g2fill to set atom labels
113  mutable
114  boost::property_map<Molecule, PropNodeLabel>::type
116 
117  //! Access to the edge label property_map of g2fill to set bond labels
118  mutable
119  boost::property_map<Molecule, PropEdgeLabel>::type
121 
122  //! Adds an atom to the internal molecule graph to fill
123  //! @param label the atom label to set
124  void
125  addAtom( const std::string& label ) const;
126 
127  //! Adds a bond to the internal molecule graph to fill
128  //! @param atom1 the first bond partner
129  //! @param atom2 the second bond partner
130  //! @param label the bond label to set
131  void
132  addBond( const int atom1, const int atom2, const std::string& label ) const;
133 
134  public:
135 
136  //! Constructs the definitions of a Daylight's SMILES grammar to parse
137  //! a SMILES string and to fill the encoded molecule into a given
138  //! boost graph object.
139  //! @param toFill the boost graph object to add nodes and edges to
140  explicit SMILES_grammar( Molecule& toFill );
141 
142  //! Constructs the definitions of a Daylight's SMILES grammar to parse
143  //! a SMILES string and to fill the encoded molecule into a given
144  //! boost graph object.
145  //! @param toFill the boost graph object to add nodes and edges to
146  //! @param groups a container that holds group IDs where
147  //! each matching node has to be replaced by the according
148  //! mapped subgraph
149  explicit SMILES_grammar( Molecule& toFill
150  , const GroupMap & groups );
151 
152  //! Parses a SMILES string and generates a graph representation of the
153  //! molecule
154  //! @param SMILES_string the string to parse
155  //! @return pair.first = the graph encoding of the molecule
156  //! pair.second = -1 if parsing was successful,
157  //! in error case it returns the string position that caused
158  //! the parsing error
159  //! @throw std::invalid_argument in case a check fails
160  static
161  std::pair< Molecule, int >
162  parseSMILES( const std::string & SMILES_string )
163  throw (std::invalid_argument);
164 
165  //! Parses a SMILES string and generates a graph representation of the
166  //! molecule
167  //! @param SMILES_string the string to parse
168  //! @param groups a container that holds group IDs where
169  //! each matching node has to be replaced by the according
170  //! mapped subgraph
171  //! @return pair.first = the graph encoding of the molecule
172  //! pair.second = -1 if parsing was successful,
173  //! in error case it returns the string position that caused
174  //! the parsing error
175  //! @throw std::invalid_argument in case a check fails
176  static
177  std::pair< Molecule, int >
178  parseSMILES( const std::string & SMILES_string, const GroupMap & groups )
179  throw (std::invalid_argument);
180 
181 
182  //! The definition of the SMILES grammar.
183  template <typename ScannerT>
184  struct definition
185  {
186  public:
187 
188  //! Construction of the SMILES BNF grammar rules
189  definition( SMILES_grammar const& self);
190 
191  //! start parsing
192  NS_BOOSTSPIRIT::rule<ScannerT> const&
193  start() const;
194 
195  protected:
196 
197  char atom2_tmp;
198 
199  /*!
200  * Dedicated parser for atom labels that can be written as simple
201  * symbols and consist of exactly one character.
202  */
203  struct simpleSymbol_parser : public NS_BOOSTSPIRIT::char_parser<simpleSymbol_parser>
204  {
206 
207  const std::string simpleSymbols; //!< the accepted one-character symbols
208 
209  //! construction: fills simpleSymbols with the accepted characters
211  : simpleSymbols("BCNOPSFIHsponcb")
212  {}
213 
214  //! tests whether or not the parsed character is one of the
215  //! characters listed in simpleSymbols
216  //! @param ch the parsed character to test
217  //! @return true if @a ch occurs in simpleSymbols,
218  //! false otherwise
219  template <typename CharT>
220  bool test(CharT ch) const
221  {
222  return simpleSymbols.find(ch)
223  != std::string::npos;
224  }
225  };
226 
228 
229  /*!
230  * Dedicated parser for atom labels consisting of exactly one character.
231  */
232  struct atom1_parser : public NS_BOOSTSPIRIT::char_parser<atom1_parser>
233  {
235 
236  //! construction
238 
239  //! tests whether or not the parsed character, taken as a
240  //! one-character string, is a key of MoleculeUtil::getAtomData()
241  //! @param ch the parsed character to test
242  //! @return true if @a ch is found in MoleculeUtil::getAtomData(),
243  //! false otherwise
244  template <typename CharT>
245  bool test(CharT ch) const
246  {
247  return MoleculeUtil::getAtomData().find(std::string(1,ch))
248  != MoleculeUtil::getAtomData().end();
249  }
250  };
251 
253 
254  //! first character of a two letter atom label
256 
257  /*!
258  * Dedicated parser for atom labels comprising only one characters.
259  */
260  struct atom2_parser : public NS_BOOSTSPIRIT::char_parser<atom2_parser>
261  {
263 
264  const char* const firstChar;
265  mutable std::string label;
266 
267  //! construction
268  atom2_parser(char * firstChar_) : firstChar(firstChar_), label(" ")
269  {}
270 
271  //! tests whether or not the parsed character is a
272  //! valid and supported atom label
273  //! @param ch the parsed character to test
274  //! @return whether or not @a ch is a valid and supported
275  //! atom label
276  template <typename CharT>
277  bool test(CharT ch) const
278  {
279  // set label
280  label[0] = *firstChar;
281  label[1] = ch;
282  // check if two letter atom label is known
283  return MoleculeUtil::getAtomData().find(label)
284  != MoleculeUtil::getAtomData().end();
285  }
286  };
287 
288 
289  /*!
290  * Dedicated parser for bond labels consisting of one character.
291  */
292  struct bond_parser : public NS_BOOSTSPIRIT::char_parser<bond_parser>
293  {
295 
296  //! construction
298 
299  //! tests whether or not the parsed character, taken as a
300  //! one-character string, is a key of MoleculeUtil::getBondData()
301  //! @param ch the parsed character to test
302  //! @return true if @a ch is found in MoleculeUtil::getBondData(), false otherwise
303  template <typename CharT>
304  bool test(CharT ch) const
305  {
306  return MoleculeUtil::getBondData().find(std::string(1,ch))
307  != MoleculeUtil::getBondData().end();
308  }
309  };
310 
312 
313 
314 
315  ////////////// TYPEDEFs //////////////
316 
317 
318  //! Closure holding the values attached to a parsed atom: bond label, atom label, atom index, ring closure index and explicit H-atom count.
319  class Atom_closure : public NS_BOOSTSPIRIT::closure< Atom_closure
320  , std::string
321  , std::string
322  , int
323  , int
324  , int >
325  {
326  public:
327  typedef NS_BOOSTSPIRIT::closure< Atom_closure
328  , std::string
329  , std::string
330  , int
331  , int
332  , int > SuperClass;
333 
334  //! bond label (closure member 1, std::string)
335  typename SuperClass::member1 blabel;
336  //! atom label (closure member 2, std::string)
337  typename SuperClass::member2 alabel;
338  //! atom index (closure member 3, int)
339  typename SuperClass::member3 cnt;
340  //! ring closure index (closure member 4, int)
341  typename SuperClass::member4 rc;
342  //! explicit H-atom count (closure member 5, int)
343  typename SuperClass::member5 hcnt;
344  };
345 
346  //! type of rule context of this class
347  typedef NS_BOOSTSPIRIT::rule< ScannerT
348  , typename Atom_closure::context_t
350 
351  // Typedefs for Local Data Structures
352  typedef std::pair<int,int> bond_t;
353  typedef std::vector<std::pair<bond_t,std::string> > bonds_t;
354  struct AtomInfo { // record of one atom: label, atom ID and aromaticity flag
355  std::string label; // label given to the constructor
356  int atomID; // ID given to the constructor
358  AtomInfo( const std::string label, const int atomID, const bool isAromatic ) // NOTE(review): the declaration of member isAromatic is not visible in this listing
359  : label(label), atomID(atomID), isAromatic(isAromatic) // each member is initialised from the parameter of the same name
360  {}
361  };
362  typedef std::vector< AtomInfo > atoms_t;
363  typedef std::vector<int> stack_t;
364  typedef std::vector<std::pair<int,int> > hcount_t;
365 #if HAVE_UNORDERED_MAP > 0
366  typedef std::unordered_map<int,int> rcs_t;
367  typedef std::unordered_map<int,std::string> rcb_t;
368 #elif HAVE_TR1_UNORDERED_MAP > 0
369  typedef std::tr1::unordered_map<int,int> rcs_t;
370  typedef std::tr1::unordered_map<int,std::string> rcb_t;
371 #elif HAVE_GNU_HASH_MAP > 0
//! Hash functor for int keys that maps each key to its own value
//! converted to size_t.
class hash_int {
public:

    //! @param v the key to hash
    //! @return @a v converted to size_t
    size_t operator()(const int& v) const
    {
        // named cast instead of a C-style cast; the conversion is the same
        return static_cast<size_t>(v);
    }

};
381  typedef __gnu_cxx::hash_map<int,int,hash_int> rcs_t;
382  typedef __gnu_cxx::hash_map<int,std::string,hash_int> rcb_t;
383 #else
384  typedef std::map<int,int> rcs_t;
385  typedef std::map<int,std::string> rcb_t;
386 #endif
387 
388 
389  ////////////// VARIABLES //////////////
390 
391  //! back reference to enclosing object for molecule creation
392  SMILES_grammar const& self;
393 
394  // rules
396 
397  NS_BOOSTSPIRIT::rule<ScannerT> smiles;
398  NS_BOOSTSPIRIT::rule<ScannerT> simple_symbol, complex_symbol, bond_symbol, group_symbol;
399  NS_BOOSTSPIRIT::rule<ScannerT> isotope, charge, chirality, hcount, name;
400 
401 
402  // Local Data Structures
404  //! map of ring closure numbers to opening atom
406  //! map of ring closure numbers to bond label of closing bond
412 
413  ////////////// METHODS //////////////
414 
415  //! Function called by the parser to report the next atom.
416  //! @param atom the atom to add
417  //! @param alabel the label of the atom
418  //! @param blabel the label of the bond to the last reported atom
419  //! @throw std::invalid_argument in case a check fails
420  void
421  memorize_atom( int atom, std::string& alabel, std::string& blabel )
422  throw (std::invalid_argument);
423 
424  //! Function called by the parser to report the explicit hydrogen
425  //! count for complex atoms
426  //! @param atom the complex atom with explicit hydrogens
427  //! @param hcount the number of explicit hydrogens
428  void
429  memorize_explicit_H( int atom, int hcount);
430 
431  //! Function called by the parser to report a ring closure.
432  //! @param rc the ring closure index that was closed
433  //! @param atom1 the first atom of the ring
434  //! @param blabel the optional given label of the ring closing bond
435  //! @throw std::invalid_argument in case a check fails
436  void
437  memorize_rc( int rc, int atom1, std::string blabel )
438  throw (std::invalid_argument);
439 
440  //! Function called by the parser to report the opening of a
441  //! molecule branching
442  //! @param atom the atom id where the branching occurred
443  void
444  open_branch( int atom );
445 
446  //! Function called by the parser to report that the last
447  //! molecule branching is ended
448  void
449  close_branch( void );
450 
451  //! Resets the local data structures to allow for the parsing of
452  //! the next SMILES string
453  void
454  reset_data_structures( void );
455 
456  //! Function called by the parser that reports warnings
457  //! to the standard error handle
458  void
459  parser_warning( std::string msg );
460 
461  //! Function called by the parser to add explicit hydrogens
462  //! from complex atoms to constructed graph
463  void
465 
466  }; // struct definition
467 
468 
469  }; // class SMILES_grammar
470 
471 
472 
473  } // namespace chem
474 } // namespace ggl
475 
476  // function implementations
477 #include "ggl/chem/SMILES_grammar.icc"
478 
479 
480 #endif /*GGL_CHEM_SMILES_GRAMMAR_HH_*/