Generated on Wed Apr 29 2015 11:51:40 for GGL-4.1.2 by doxygen 1.8.3.1
SMILESwriter.hh
Go to the documentation of this file.
1 #ifndef GGL_CHEM_SMILESWRITER_HH_
2 #define GGL_CHEM_SMILESWRITER_HH_
3 
4 #include "ggl/chem/Molecule.hh"
6 
7 #include <map>
8 
9 namespace ggl {
10  namespace chem {
11 
12 
13 
14  /*! @brief Molecule to SMILES writer
15  *
16  * Utility class to generate a canonical SMILES string from a molecule
17  * graph representation. It expects atom (node) and bond (edge) labels
18  * that follow Daylight's SMILES description. See
19  * ggl::chem::SMILES_grammar for further details.
20  *
21  * It implements the algorithm suggested by Weininger (1989)
22  *
23  * @article{ Weininger_1989,
24  * title={SMILES. 2. Algorithm for generation of unique SMILES notation},
25  * author={Weininger, D and Weininger, A and Weininger, J L},
26  * journal={Journal of Chemical Information and Modeling},
27  * volume={29},
28  * number={2},
29  * pages={97--101},
30  * year={1989},
31  * publisher={American Chemical Society},
32  * url={http://dx.doi.org/10.1021/ci00062a008}
33  * }
34  *
35  * NOTE : THIS WRITER IS INCOMPLETE, I.E. NOT ALL TYPES OF MOLECULE ATOMS
36  * AND BONDS ARE HANDLED BY THIS WRITER !!!
37  *
38  * Supported atom labels are defined by MoleculeUtil::getAtomData().
39  *
40  * Supported bond labels are defined by MoleculeUtil::getBondData().
41  *
42  * @author Alexander Ullrich (c) 2008
43  * @author Martin Mann (c) 2008 http://www.bioinf.uni-freiburg.de/~mmann/
44  *
45  */
47  {
48  protected:
49 
50  // descriptor types identifying a vertex (atom) and an edge (bond) of a Molecule graph
51  typedef boost::graph_traits<Molecule>::vertex_descriptor MVertex_t;
52  typedef boost::graph_traits<Molecule>::edge_descriptor MEdge_t;
53  // iterator types: over the vertices of the graph, and over the vertices adjacent to a given vertex
54  typedef boost::graph_traits<Molecule>::vertex_iterator MV_Iterator_t;
55  typedef boost::graph_traits<Molecule>::adjacency_iterator MA_Iterator_t;
56 
57  // read-only (const_type) property map types for node index, node label and edge label
58  typedef boost::property_map<Molecule, PropNodeIndex>::const_type MV_Index_Map_t;
59  typedef boost::property_map<Molecule, PropNodeLabel>::const_type MV_Property_Map_t;
60  typedef boost::property_map<Molecule, PropEdgeLabel>::const_type ME_Property_Map_t;
61 
62  //! utility map type (vertex -> visited flag) for DFS traversal of the Molecule graph
63  typedef std::map<MVertex_t, bool> MV_Visited_Map_t;
64 
65  /*! Container that holds all data necessary to calculate the invariant
66  * of a molecule atom node (see getInvariant()).
67  */
68  class NodeData {
69  public:
78  //! default construction: every member named in the initializer list below starts at 0
80  : valence(0), atomicNumber(0), connect(0), nonhydro(0), sign(0), charge(0), protons(0), isAromatic(0)
81  {}
82  /*! construction from atom label
83  * @param atomLabel the atom label that defines the initial data
84  * @param allowWildcard whether or not a wildcard is an allowed
85  * atom label
86  * @throws std::runtime_error if no atom data can be accessed for the atom label
87  */
88  NodeData( const std::string & atomLabel
89  , const bool allowWildcard );
90 
91  //! calculates the node invariant as a decimal-weighted sum of the current data; valence and isAromatic do not contribute
92  //! @return the invariant of this molecule atom
93  int
94  getInvariant() const {
95  return 10000000*connect // largest weight
96  + 100000*nonhydro
97  + 1000*atomicNumber // NOTE(review): a value >= 100 reaches the weight of nonhydro; no range check is done here
98  + 100*sign
99  + 10*charge // NOTE(review): sign, charge and protons only stay in separate decimal positions while each is < 10
100  + protons; // smallest weight
101  }
102 
103  };
104 
105 
106  typedef std::map<MVertex_t, NodeData> MV_NodeData_Map; //!< maps a vertex to its NodeData
107 
108  static std::set<std::string> organic_subset; //!< static set of strings; presumably the atom labels of the SMILES "organic subset" - TODO confirm against its definition
109 
110  public:
111 
112  SMILESwriter(); //!< default construction (declaration only; no body is given at this point)
113 
114  /*! Generates a canonical SMILES string of the given graph
115  * representation of a molecule.
116  *
117  * NOTE: THE FUNCTIONALITY IS INCOMPLETE, i.e. not all atom and
118  * bond types are possible! See class description!
119  *
120  * @param m the molecule graph to write as SMILES
121  * @param ignoreProtons (default: true) if true and @a m is no HH molecule,
122  * all protons that can be inferred from atom and bond valence
123  * are ignored when producing the SMILES string;
124  * otherwise protons are compressed
125  * into the adjacent non-proton atom label
126  * @param allowWildcard (default: false) whether or not the wildcard is a
127  * valid atom label within SMILES. NOTE: allowing it results in non-standard
128  * SMILES since the wildcard label is a non-standard extension
129  * within the GGL!
130  * @return a canonical SMILES string representing the given molecule
131  * @throws std::runtime_error if unsupported atom or bond labels are
132  * encountered
133  */
134  static
135  std::string
136  getSMILES( const Molecule& m
137  , const bool ignoreProtons = true
138  , const bool allowWildcard = false );
139 
140  /*! Generates a canonical SMILES string of the given graph
141  * representation of a molecule.
142  *
143  * Before this is done, all known groups are compressed into their
144  * corresponding group labels using MoleculeUtil::compressGroups(..).
145  *
146  * NOTE: THE FUNCTIONALITY IS INCOMPLETE, i.e. not all atom and
147  * bond types are possible! See class description!
148  *
149  * @param m the molecule graph to write as SMILES
150  * @param groups a container that holds group IDs where
151  * each matching node represents the corresponding subgraph
152  * @param ignoreProtons (no default value) if true and @a m is no HH molecule,
153  * all protons that can be inferred from atom and bond valence
154  * are ignored when producing the SMILES string;
155  * otherwise protons are compressed
156  * into the adjacent non-proton atom label
157  * @param allowWildcard (default: false) whether or not the wildcard is a
158  * valid atom label within SMILES. NOTE: allowing it results in non-standard
159  * SMILES since the wildcard label is a non-standard extension
160  * within the GGL!
161  * @return a canonical SMILES string representing the given molecule
162  */
163  static
164  std::string
165  getSMILES( const Molecule& m
166  , const GroupMap & groups
167  , const bool ignoreProtons
168  , const bool allowWildcard = false );
169 
170  protected:
171 
172  /*! Generates a canonical SMILES string of the given graph
173  * representation of a molecule (protected variant, no default arguments).
174  *
175  * NOTE: THE FUNCTIONALITY IS INCOMPLETE, i.e. not all atom and
176  * bond types are possible! See class description!
177  *
178  * @param m the molecule graph to write as SMILES
179  * @param groups pointer to a container that holds group IDs where
180  * each matching node represents the corresponding subgraph; can be
181  * NULL if no groups are to be considered
182  * @param ignoreProtons if true and @a m is no HH molecule,
183  * all protons that can be inferred from atom and bond valence
184  * are ignored when producing the SMILES string;
185  * otherwise protons are compressed
186  * into the adjacent non-proton atom label
187  * @param allowWildcard whether or not the wildcard is a valid atom
188  * label within SMILES. NOTE: allowing it results in non-standard
189  * SMILES since the wildcard label is a non-standard extension
190  * within the GGL!
191  * @return a canonical SMILES string representing the given molecule
192  */
193  static
194  std::string
195  getSMILES( const Molecule& m
196  , const GroupMap * const groups
197  , const bool ignoreProtons
198  , const bool allowWildcard );
199 
200  static
201  const int primes[]; //!< static array of constant ints, declared without size or initializer here; presumably a table of prime numbers - TODO confirm
202 
203  static
204  const int primesLength; //!< presumably the number of entries in primes[] - TODO confirm against the definition
205 
206  static
207  int
208  prime(int number); //!< NOTE(review): undocumented; presumably looks up a prime for @a number, indexing convention unknown - TODO confirm
209 
210  static
211  std::set<std::string>
213 
214  /* NOTE(review): only @a groups is documented; name and signature suggest a canonical node ranking is returned and @a nodeData (non-const reference) may be updated - TODO confirm
215  * @param groups a container that holds group IDs where
216  * each matching node represents the corresponding subgraph; can be
217  * NULL if no groups are to be considered
218  */
219  static
220  std::vector<int>
221  canonize(const Molecule *graph
222  , const MV_Visited_Map_t &visit
223  , const MV_Index_Map_t & idx
224  , const MV_Property_Map_t & vname
225  , const ME_Property_Map_t & ename
226  , MV_NodeData_Map & nodeData
227  , const GroupMap * const groups
228  , const bool allowWildcard
229  );
230 
231  static
232  std::string
234  , const Molecule *graph
235  , std::vector<int> *ranks
236  , MV_Visited_Map_t & visit
237  , const MV_NodeData_Map & nodeData
238  , const MV_Index_Map_t & idx
239  , const MV_Property_Map_t & vname
240  , const ME_Property_Map_t & ename
241  , const bool ignoreProtons
242  );
243 
244  static
245  std::string
246  second_pass(std::string smiles); //!< takes @a smiles by value and returns a string; NOTE(review): presumably a post-processing pass over a generated SMILES - TODO confirm
247 
248  /*!
249  * Checks whether an atom identifier has to be enclosed in
250  * brackets within the SMILES notation.
251  *
252  * @param atom the atom label without enclosing brackets
253  * @return true, if the atom label has to be enclosed in brackets;
254  * false otherwise.
255  */
256  static
257  bool
258  isWithBracketsInSMILES( const std::string& atom );
259 
260  /*!
261  * Produces a SMILES-conforming label for the given atom label, i.e.
262  * enclosing brackets are added if needed.
263  * @param label the atom label to check
264  * @return the SMILES-conforming atom label
265  */
266  static
267  std::string
268  getLabel( const std::string& label );
269 
270  /*!
271  * Produces a SMILES-conforming label for the given atom label, i.e.
272  * enclosing brackets are added if needed. Furthermore, protons can
273  * be removed from the SMILES if their number can be deduced from
274  * atom and bond valence.
275  * @param label the atom label to check
276  * @param nodeData the atom information for this atom
277  * @param ignoreProtons whether or not protons should be removed from
278  * the atom label if possible
279  * @return the SMILES-conforming atom label
280  */
281  static
282  std::string
283  getLabel( const std::string& label
284  , const NodeData& nodeData
285  , const bool ignoreProtons );
286 
287  };
288 
289 
290  } // namespace chem
291 } // namespace ggl
292 
293  // include method implementation
294 #include "ggl/chem/SMILESwriter.icc"
295 
296 #endif /*GGL_CHEM_SMILESWRITER_HH_*/