Generated on Wed Apr 29 2015 11:51:40 for GGL-4.1.2 by doxygen 1.8.3.1
MoleculeUtil.hh
Go to the documentation of this file.
1 #ifndef GGL_CHEM_MOLECULEUTIL_HH_
2 #define GGL_CHEM_MOLECULEUTIL_HH_
3 
4 #include <map>
5 #include <string>
6 #include <iostream>
7 #include <stdexcept>
8 #include <algorithm>
9 
10 #include <sgm/HashMap.hh>
11 
12 #if HAVE_UNORDERED_MAP > 0
13  #include <unordered_map>
14 #elif HAVE_TR1_UNORDERED_MAP > 0
15  #include <tr1/unordered_map>
16 #elif HAVE_GNU_HASH_MAP > 0
17  #include <ext/hash_map>
18 #else
19  #include <map>
20 #endif
21 
22 #include "ggl/chem/Molecule.hh"
24 
25 namespace ggl {
26  namespace chem {
27 
28 
29  /*! Container for molecule group ID -> subgraph mappings
30  *
31  * Container to store the IDs of molecular groups that can be represented
32  * within molecules by a single node and that are replaced by the
33  * corresponding subgraphs (stored within this container).
34  */
35  typedef
36  #if HAVE_UNORDERED_MAP > 0
37  std::unordered_map<std::string, MoleculeComponent >
38  #elif HAVE_TR1_UNORDERED_MAP > 0
39  std::tr1::unordered_map<std::string, MoleculeComponent >
40  #elif HAVE_GNU_HASH_MAP > 0
41  __gnu_cxx::hash_map< std::string, MoleculeComponent, sgm::hash_string >
42  #else
43  std::map< std::string, MoleculeComponent >
44  #endif
46 
47  /*! @brief Molecule utility factory
48  *
49  * Utility class that contains certain data and members needed for
50  * Molecule evaluation and handling.
51  *
52  * @author Martin Mann (c) 2009 http://www.bioinf.uni-freiburg.de/~mmann/
53  */
55  {
56  public:
57 
58  //! typedef to represent one byte of information
59  typedef unsigned char OneByte;
60 
61  //! the wildcard character for atom labels valid in molecule
62  //! descriptions
63  static const std::string AtomLabelWildcard;
64 
65  /*! Data type that stores information connected to a certain atom label.
    *
    * NOTE(review): several member declarations documented below, as well as
    * the name line of the second constructor, are not visible in this
    * listing (the embedded line numbering skips them). Consult the original
    * header before relying on the exact member types.
66  */
67  class AtomLabelData {
68 
69  protected:
70 
    // NOTE(review): undocumented in the original and not referenced by the
    // code visible here; presumably cumulative electron shell sizes - confirm.
71  static std::vector<OneByte> shellSizeSums;
72 
73  public:
74 
    // container type used for the list of valence alternatives
75  typedef std::vector<OneByte> OneByteVec;
76 
77  //! atomic number
79  //! available standard number of valence electrons
81  //! is set to 1 if the label denotes a part of an aromatic ring,
82  //! 0 otherwise
84  //! mean atomic weight
85  double atomicWeight;
86  //! maximal number of protons attached
88  //! list of all possible numbers of "valences" to be considered for valence checks
90  //! if set to 1 (default) a valence check is done for this atom,
91  //! otherwise (0) no valence check is performed
93 
94  /*! Construction
95  * initializes the list of all possible valences with "valence"
96  *
97  * @param atomicNumber_ the atomic number of the atom
98  * @param valence_ the available number of valence electrons
99  * @param isAromatic_ set 1 if the atom is part of an aromatic
100  * ring; 0 otherwise
101  * @param atomicWeight_ the atomic weight of the atom
102  * @param isToBeChecked_ set 1 (default) if a valence check is to be done
103  * for this type of atom, 0 otherwise.
104  */
105  AtomLabelData( const OneByte atomicNumber_
106  , const OneByte valence_
107  , const OneByte isAromatic_
108  , const double atomicWeight_
109  , const OneByte isToBeChecked_ = 1)
110  : atomicNumber(atomicNumber_)
111  , valence(valence_)
112  , isAromatic(isAromatic_)
113  , atomicWeight(atomicWeight_)
116  , isToBeChecked(isToBeChecked_)
117  {
118  // store the standard valence as first entry of the list of valences to be considered for checks
    // NOTE(review): dereferencing begin() requires valenceAlternatives to be
    // non-empty at this point. No initializer for it is visible in this
    // listing (the embedded numbering skips 114-115) - confirm against the
    // original header that the list is sized before this assignment.
119  *(valenceAlternatives.begin()) = valence;
120  }
121 
122  /*! Construction with an explicit list of valence alternatives.
    *
    * The given list is copied; if the standard valence is not contained in
    * it, the standard valence is appended to the stored copy.
    *
123  * @param atomicNumber_ the atomic number of the atom
124  * @param valence_ the number of valence electrons to set
125  * @param isAromatic_ set 1 if the atom is part of an aromatic
126  * ring; 0 otherwise
127  * @param atomicWeight_ the atomic weight of the atom
128  * @param valenceList list of all valences to be considered for checks.
129  * @param isToBeChecked_ set 1 if a valence check is to be done
130  * for this type of atom, 0 otherwise.
131  */
133  const OneByte atomicNumber_
134  , const OneByte valence_
135  , const OneByte isAromatic_
136  , const double atomicWeight_
137  , const OneByte isToBeChecked_
138  , const OneByteVec valenceList )
139  : atomicNumber(atomicNumber_)
140  , valence(valence_)
141  , isAromatic(isAromatic_)
142  , atomicWeight(atomicWeight_)
143  , valenceAlternatives(valenceList)
144  , isToBeChecked(isToBeChecked_)
145  {
146  // ensure that the standard valence is among the possible valence values
147  if( std::find(valenceAlternatives.begin(), valenceAlternatives.end(), valence) == valenceAlternatives.end() ) {
148  valenceAlternatives.push_back(valence);
149  }
150  // sanity check: offering more than one valence alternative is only meaningful if the valence check is enabled
    // NOTE(review): assert needs <cassert> and std::vector needs <vector>;
    // neither is among the includes visible in this file, so they are
    // presumably pulled in transitively - confirm.
151  assert( valenceAlternatives.size() == 1 || (int)isToBeChecked == 1 );
152  }
153 
154  };
155 
156  /*! Mapping of atom labels (in SMILES notation but without brackets)
157  * to the corresponding atom information like valence etc.
158  */
159  typedef std::map< std::string, AtomLabelData > AtomDataMap;
160 
161  /*! Mapping of aromatic labels to their non-aromatic form and vice
162  * versa, to enable a relabeling of aromatic rings.
163  */
164  typedef std::map< std::string, std::string > AromaticSwapMap;
165 
166  /*! Data type that stores information connected to a certain bond label.
167  */
169  public:
170  //! number of valence electrons of the bond
172  //! is set to 1 if the label denotes a part of an aromatic ring,
173  //! 0 otherwise
175 
176  /*! Construction
    *
    * Copies both arguments unchanged into the members of the same name;
    * no validation is performed.
    *
177  * @param valence_ the number of valence electrons to set
178  * @param isAromatic_ set 1 if the bond is part of an aromatic ring;
179  * 0 otherwise
180  */
181  BondLabelData( const OneByte valence_
182  , const OneByte isAromatic_ )
183  : valence(valence_)
184  , isAromatic(isAromatic_)
185  {}
186  };
187 
188  /*! Mapping of bond labels (in SMILES notation) to the corresponding
189  * bond information like valence etc.
190  */
191  typedef std::map< std::string, BondLabelData > BondDataMap;
192 
193 
194  public:
195 
196  //! consistency code : everything fine
197  static const size_t C_Consistent;
198  //! consistency code : at least one atom label is not SMILES conform
199  //! or currently not supported within the library
200  static const size_t C_AtomLabelInvalid;
201  //! consistency code : at least one bond label is not SMILES conform
202  //! or currently not supported within the library
203  static const size_t C_BondLabelInvalid;
204  //! consistency code : at least one atom label is a wildcard which is
205  //! currently not supported within the library
206  static const size_t C_AtomLabelWildcard;
207  //! consistency code : at least one atom label is complex and contains
208  //! implicit H atoms, this is currently not supported
209  static const size_t C_AtomComplexWithH;
210  //! consistency code : at least one atom shows an inconsistent
211  //! electron distribution, ie. valence+charge != protonCount
212  static const size_t C_AtomValence;
213  //! consistency code : at least one bond connects the same atom, i.e.
214  //! forms a loop
215  static const size_t C_BondLoop;
216  //! consistency code : the molecule graph is not connected, i.e.
217  //! contains at least two connected components
218  static const size_t C_NonConnected;
219 
220  public:
221 
222  //! Default construction
223  MoleculeUtil();
224  //! Default destruction
225  virtual ~MoleculeUtil();
226 
227  /////////////////// ATOM DATA ACCESS ETC. ////////////////////////
228 
229  /*! Access to the currently supported atom labels and the
230  * corresponding atom information, e.g. valence etc.
231  *
232  * @return the atom2data mapping
233  */
234  static
235  const AtomDataMap&
236  getAtomData( void );
237 
238  /*! Access to atom information for the given atom label, if existing.
239  *
240  * @param label the atom label to derive the information from
241  * @return the atom2data entry or NULL if none available
242  */
243  static
244  const AtomLabelData * const
245  getAtomData( const std::string& label );
246 
247 
248  /*!
249  * Access to the aromatic/non-aromatic counterpart ("pendant") of an atom
250  * or edge label if it exists.
251  *
252  * @param label the label of interest
253  * @return a pointer to the counterpart label or NULL if none exists
254  */
255  static
256  const std::string * const
257  getAromaticPendant( const std::string& label );
258 
259  /*! Access to atom name within a given atom label.
260  *
261  * @param label the atom label to derive the information from
262  * @return the name of the atom
263  */
264  static
265  std::string
266  getAtom( const std::string& label );
267 
268  /*! Access to number of protons within a given atom label,
269  * if existing.
270  *
271  * @param label the atom label to derive the information from
272  * @return the number of additional protons in the label
273  */
274  static
275  size_t
276  getProtons( const std::string& label );
277 
278  /*! Access to the charge within a given atom label,
279  * if existing.
280  *
281  * @param label the atom label to derive the information from
282  * @return the charge information in the label
283  */
284  static
285  int
286  getCharge( const std::string& label );
287 
288  /*! Access to the class information within a given atom label,
289  * if existing.
290  *
291  * @param label the atom label to derive the information from
292  * @return the class information in the label, or 0 if not present
293  */
294  static
295  int
296  getClass( const std::string& label );
297 
298 
299  /*!
300  * Produces a complex atom label with the given information.
301  *
302  * @param atom the atom label
303  * @param protons the number of protons attached to the atom;
304  * a value of 0 is ignored
305  * @param charge the charge of the atom; a value of 0 is ignored
306  * @param classID the classID; a value of 0 is ignored
307  * @param explicitChargeValue if true, a charge value of 1 is
308  * represented by "+1" rather than just "+"
309  *
310  * @return the according complex atom label
311  */
312  static
313  std::string
314  getComplexAtomLabel( const std::string& atom
315  , const size_t protons = 0
316  , const int charge = 0
317  , const int classID = 0
318  , const bool explicitChargeValue = false );
319 
320 
321  /////////////////// BOND DATA ACCESS ETC. ////////////////////////
322 
323  /*! Access to the currently supported bond labels and the
324  * corresponding bond information, e.g. valence etc.
325  *
326  * @return the bond2data mapping
327  */
328  static
329  const BondDataMap&
330  getBondData( void );
331 
332  /*! Access to bond information for the given bond label, if existing.
333  *
334  * @param label the bond label to derive the information from
335  * @return the bond2data entry or NULL if none available
336  */
337  static
338  const BondLabelData * const
339  getBondData( const std::string& label );
340 
341  /////////////////// CONSISTENCY CHECKS ETC. ////////////////////////
342 
343  /*! Checks if the given atom (node) label is SMILES conform and
344  * currently supported, e.g. by the SMILESwriter class.
345  *
346  * @param atomLabel the label to check
347  * @return true if the atom label is ok, false otherwise
348  */
349  static
350  bool
351  isValidAtomLabel( const std::string & atomLabel );
352 
353  /*! Checks if the given bond (edge) label is SMILES conform and
354  * currently supported, e.g. by the SMILESwriter class.
355  *
356  * @param bondLabel the label to check
357  * @return true if the bond label is ok, false otherwise
358  */
359  static
360  bool
361  isValidBondLabel( const std::string & bondLabel );
362 
363  /*! Checks if a given ggl::chem::Molecule graph is consistent.
364  * If not, according error codes are returned.
365  *
366  * @param mol the molecule graph to check
367  * @return if consistent C_Consistent is returned. Otherwise a product
368  * of the according C_* values.
369  */
370  static
371  size_t
372  isConsistent( const Molecule & mol );
373 
374  /*!
375  * Writes a description of the consistency status or errors, encoded
376  * in a consistency code produced by isConsistent*(...), to a given
377  * output stream. The function reports whether or not the code is error-free.
378  *
379  * @param consistencyCode the consistency code to parse, produced by
380  * a call to isConsistent*(...)
381  * @param errorStream the output stream to write the error description
382  * to
383  * @return true if no error is encoded; false otherwise
384  */
385  static
386  bool
387  decodeConsistencyStatus( const size_t consistencyCode
388  , std::ostream& errorStream ) ;
389 
390 
391  /////////////////// MOLECULE EDITING ETC. ////////////////////////
392 
393 
394  /*! Copies a given molecule into another molecule object which is
395  * overwritten.
396  *
397  * @param mol the molecule graph to copy
398  * @param toFill the molecule graph to make a copy of mol
399  */
400  static
401  void
402  copy( const Molecule& mol, Molecule & toFill);
403 
404 
405  /*! Copies a given molecule into another molecule object which is
406  * overwritten.
407  *
408  * Note, no sanity checks of node/edge labels, degrees, etc. are done!
409  *
410  * @param mol the molecule graph to copy
411  * @param toFill the molecule graph to make a copy of mol
412  */
413  static
414  void
415  copy( const sgm::Graph_Interface& mol, Molecule & toFill);
416 
417  /*! Compresses all explicitly represented "H" atoms into the adjacent
418  * atom label.
419  *
420  * Note: only protons with atom node label "H" are compressed. Nodes
421  * e.g. including class information like "H:1" are maintained and not
422  * removed.
423  *
424  * @param mol the molecule graph to compress
425  */
426  static
427  void
428  compressHnodes( Molecule& mol );
429 
430 
431  /*! Removes all represented "H" atoms that can be inferred from
432  * atom valence, charge and adjacent bond information. All other
433  * proton information that cannot be inferred is collapsed into
434  * according complex atom labels.
435  *
436  * Thus, in the end, the molecule does not contain explicit adjacent
    * proton nodes anymore.
437  *
438  * Single protons or HH molecules are preserved as they are.
439  *
440  * @param mol the molecule graph to be stripped
441  */
442  static
443  void
444  removeProtons( Molecule& mol );
445 
446 
447  /*!
448  * Computes the number of missing hydrogens to be added to this atom.
    *
    * NOTE(review): the parameters were undocumented in the original header.
    * The descriptions below are inferred from the parameter names only and
    * have to be confirmed against the implementation.
    *
    * @param atomLabel presumably the label of the atom of interest
    * @param atomCharge presumably the charge of the atom
    * @param bondValenceSum presumably the summed valence of all bonds of
    *        the atom
    * @param bondNum presumably the number of bonds of the atom
    * @param bondNumAromatic presumably the number of aromatic bonds among
    *        them
    * @param bondNumProton presumably the number of bonds to explicit
    *        protons among them
    * @return the number of hydrogens to add
449  */
450  static
451  size_t
452  getProtonsToAdd( const std::string & atomLabel
453  , const int atomCharge
454  , const size_t bondValenceSum
455  , const size_t bondNum
456  , const size_t bondNumAromatic
457  , const size_t bondNumProton
458  );
459 
460 
461  /*! Adds explicit "H" atoms according to the valence of atom nodes
462  * and protons within complex atom labels.
463  *
464  * TODO: NOTE, this implementation is still experimental and is not
465  * necessarily correct!
466  *
467  * @param mol the molecule graph to fill
468  */
469  static
470  void
471  fillProtons( Molecule& mol );
472 
473 
474  /*!
475  * Converts a given molecule into Chemical Markup Language (CML) format.
476  *
477  * See http://cml.sourceforge.net/ for further details on the format.
478  *
479  * @param mol the molecule to represent
480  * @return the CML string representation of the molecule
481  */
482  static
483  std::string
484  convertCML( const Molecule& mol );
485 
486 
487  /*!
488  * Converts a given molecule into Chemical Markup Language (CML) format
    * and writes the result to the given stream.
489  *
490  * See http://cml.sourceforge.net/ for further details on the format.
491  *
492  * @param mol the molecule to represent
493  * @param stream the output stream to add the CML representation to
494  * @return the altered stream filled with the CML string
495  * representation of the molecule
496  */
497  static
498  std::ostream &
499  convertCML( const Molecule& mol, std::ostream & stream );
500 
501 
502  /*!
503  * Replaces all nodes with group labels with the according molecule
504  * component if present in groups. If the group label is unknown, an
505  * exception is raised.
506  *
507  * NOTE: The node replacement is NOT recursive to avoid infinite
508  * replacement chains.
509  *
510  * @param mol the molecule to alter
511  * @param groups the list of known groups
512  *
513  * @throws std::runtime_error in case a group label is unknown
514  */
515  static
516  void
518  , const GroupMap & groups
519  ) throw(std::runtime_error);
520 
521 
522  /*!
523  * Iteratively identifies groups within the molecule and replaces them
524  * by the according group label. The replacement is done in decreasing
525  * group size, ie. first the largest groups are compressed.
526  *
527  * @param mol the molecule to alter
528  * @param groups the list of known groups to be introduced if found
529  *
530  */
531  static
532  void
534  , const GroupMap & groups
535  );
536 
537  /*!
538  * Checks whether or not a given atom node label is a group label.
539  * @param nodeLabel the node label of interest
540  * @return true if the given node label is a group label; false
541  * otherwise
542  */
543  static
544  bool
545  isGroupLabel( const std::string & nodeLabel );
546 
547 
548  /*!
549  * Extracts the group identifier from the (complex) node label.
550  * @param nodeLabel the node label that contains a group label
551  * @return the group label or an empty string if no group label is
552  * present
553  */
554  static
555  std::string
556  getGroupLabel( const std::string & nodeLabel );
557 
558  protected:
559 
560  //! holds all digits for string parsing
561  static const std::string DIGIT;
562  //! holds all white space characters for string parsing
563  static const std::string WHITESPACE;
564 
565  //! the stored atom label to data mapping
567 
568  //! maps aromatic labels onto their non-aromatic form and vice versa
570 
571  //! the stored bond label to data mapping
573 
574 
575  /*! Checks if all node labels of the given molecule are supported by
576  * the current SMILESwriter.
577  *
578  * @param mol the molecule to check
579  * @return true iff all node labels are ok, false otherwise
580  */
581  static
582  bool
583  checkAtomLabel( const Molecule & mol );
584 
585  /*! Checks if any node label is equal to AtomLabelWildcard. Since this
586  * is currently not supported, the check returns false if any occurrence
587  * is found.
588  *
589  * @param mol the molecule to check
590  * @return true iff no atom label equals AtomLabelWildcard;
591  * false otherwise
592  */
593  static
594  bool
595  checkAtomLabelWildcard( const Molecule & mol );
596 
597  /*! Checks if any complex node label contains implicit H atoms.
598  * This is currently not supported by the library.
599  *
600  * @param mol the molecule to check
601  * @return presumably true iff no complex atom label contains implicit
    * H atoms; false otherwise.
    * NOTE(review): the original return description was a verbatim copy of
    * the one of checkAtomLabelWildcard - confirm the polarity against the
    * implementation.
603  */
604  static
605  bool
606  checkAtomComplexWithH( const Molecule & mol );
607 
608  /*! Checks if any atom shows an inconsistent electron distribution,
609  * ie. [valence+charge != bondValenceSum+aromAdd] where aromAdd == 1
610  * if the atom is part of an aromatic ring and 0 otherwise.
611  *
612  * @param mol the molecule to check
613  * @return true iff no atom shows inconsistent electron distribution
614  * false otherwise
615  */
616  static
617  bool
618  checkAtomValence( const Molecule & mol );
619 
620  /*! Checks if all bond labels of the given molecule are supported by
621  * the current SMILESwriter.
622  *
623  * @param mol the molecule to check
624  * @return true iff all bond labels are ok; false otherwise
625  */
626  static
627  bool
628  checkBondLabel( const Molecule & mol );
629 
630  /*! Checks if an atom forms a bond with itself, i.e. a loop (self-bond).
631  *
632  * @param mol the molecule to check
633  * @return presumably true iff no bond connects an atom with itself;
    * false otherwise.
    * NOTE(review): the original return description was a verbatim copy of
    * the one of checkBondLabel - confirm the polarity against the
    * implementation.
634  */
635  static
636  bool
637  checkBondLoop( const Molecule & mol );
638 
639  /*! Checks if the molecule shows more than one connected component.
640  *
641  * @param mol the molecule to check
642  * @return true iff the molecule is connected; false otherwise
643  */
644  static
645  bool
646  checkNonConnected( const Molecule & mol );
647 
648 
649 
650  };
651 
652 
653 
654  } // namespace chem
655 } // namespace ggl
656 
657 #include "ggl/chem/MoleculeUtil.icc"
658 
659 #endif /*MOLECULEUTIL_HH_*/