-
Notifications
You must be signed in to change notification settings - Fork 23
NO_JIRA Adding two scripts from ccdc confidential for conformer analysis. Ori… #91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
+388
−0
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| # Conformer Filter Density | ||
|
|
||
| Filter conformers using a variety of metrics. See arguments in script for details. | ||
|
|
||
| CCDC Python API Licence required, minimum version: 3.0.15 | ||
189 changes: 189 additions & 0 deletions
189
scripts/conformer_filter_density/conformer_filter_density.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| #!/usr/bin/env python3 | ||
| # | ||
| # This script can be used for any purpose without limitation subject to the | ||
| # conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx | ||
| # | ||
| # This permission notice and the following statement of attribution must be | ||
| # included in all copies or substantial portions of this script. | ||
| # | ||
| # 2026-02-03: created by the Cambridge Crystallographic Data Centre | ||
|
|
||
| import argparse | ||
| import csv | ||
|
|
||
| from ccdc import conformer, io | ||
|
|
||
|
|
||
| def parse_args(): | ||
| """Parse command line arguments.""" | ||
| parser = argparse.ArgumentParser(description=__doc__) | ||
|
|
||
| parser.add_argument('inmolfn', | ||
| metavar='<input molecule file>', | ||
| help='Input file (single- or multi-molecule file)') | ||
|
|
||
| parser.add_argument('-m', '--mode', | ||
| choices=['absolute', 'relative'], | ||
| default='absolute', | ||
| help='Limit mode: absolute (fixed threshold) or relative ' | ||
| '(threshold based on molecule with fewest unusual torsions). ' | ||
| 'WARNING: Relative mode may behave unexpectedly with conformers ' | ||
| 'from multiple input molecules (default: %(default)s)') | ||
|
|
||
| parser.add_argument('-l', '--limit', | ||
| dest='torsion_limit', | ||
| type=int, | ||
| default=0, | ||
| metavar='<limit>', | ||
| help='Maximum number of unusual torsions for a passing molecule ' | ||
| '(default: %(default)s)') | ||
|
|
||
| parser.add_argument('-d', '--local-density', | ||
| dest='local_density_threshold', | ||
| type=float, | ||
| default=10.0, | ||
| metavar='<threshold>', | ||
| help='Local density threshold for classifying a torsion as unusual ' | ||
| '(default: %(default)s)') | ||
|
|
||
| parser.add_argument('--incl-organometallics', | ||
| dest='incl_organometallics', | ||
| action='store_true', | ||
| help='Include organometallic compounds in the search ' | ||
| '(default: organic compounds only)') | ||
|
|
||
| parser.add_argument('--generalisation', | ||
| action='store_true', | ||
| help='Turn on generalisation for searches') | ||
|
|
||
| parser.add_argument('--successfn', | ||
| default='successes.mol', | ||
| metavar='<file>', | ||
| help='Output file for molecules that pass the filter ' | ||
| '(default: %(default)s)') | ||
|
|
||
| parser.add_argument('--failurefn', | ||
| default='failures.mol', | ||
| metavar='<file>', | ||
| help='Output file for molecules that fail the filter ' | ||
| '(default: %(default)s)') | ||
|
|
||
| parser.add_argument('-u', '--unusual-torsions', | ||
| dest='unusualtorsionfn', | ||
| default='unusual_torsions.csv', | ||
| metavar='<file>', | ||
| help='Output CSV file for unusual torsion details ' | ||
| '(default: %(default)s)') | ||
|
|
||
| return parser.parse_args() | ||
|
|
||
|
|
||
| def create_mogul_engine(local_density_threshold, incl_organometallics, generalisation): | ||
| """Create and configure a geometry analyser engine. | ||
|
|
||
| Args: | ||
| local_density_threshold: Threshold for classifying torsions as unusual | ||
| incl_organometallics: Whether to include organometallic compounds | ||
| generalisation: Whether to enable generalisation for searches | ||
|
|
||
| Returns: | ||
| Configured ccdc.conformer.GeometryAnalyser instance | ||
| """ | ||
| engine = conformer.GeometryAnalyser() | ||
|
|
||
| engine.settings.bond.analyse = False | ||
| engine.settings.angle.analyse = False | ||
| engine.settings.ring.analyse = False | ||
|
|
||
| engine.settings.torsion.local_density_threshold = local_density_threshold | ||
| engine.settings.generalisation = generalisation | ||
| engine.settings.organometallic_filter = 'all' if incl_organometallics else 'organics_only' | ||
|
|
||
| return engine | ||
|
|
||
|
|
||
| def analysis(torsion_limit, input_filename, mode, engine, success_file, failure_file, unusual_torsion_file): | ||
| """Analyze molecules for unusual torsions and filter based on criteria. | ||
|
|
||
| Args: | ||
| torsion_limit: Maximum number of unusual torsions allowed | ||
| input_filename: Path to input molecule file | ||
| mode: 'absolute' or 'relative' filtering mode | ||
| engine: Configured GeometryAnalyser instance | ||
| success_file: Path to output file for passing molecules | ||
| failure_file: Path to output file for failing molecules | ||
| unusual_torsion_file: Path to CSV file for unusual torsion details | ||
| """ | ||
| # Analyze all molecules and collect unusual torsion data | ||
| molecules = [] | ||
| min_unusual_torsions = float('inf') | ||
|
|
||
| with io.MoleculeReader(input_filename) as mol_reader: | ||
| for molecule in mol_reader: | ||
| molecule.standardise_aromatic_bonds() | ||
| molecule.standardise_delocalised_bonds() | ||
|
|
||
| geometry_analysed_molecule = engine.analyse_molecule(molecule) | ||
|
|
||
| molecule.unusual_torsions = [ | ||
| t for t in geometry_analysed_molecule.analysed_torsions | ||
| if t.unusual and t.enough_hits | ||
| ] | ||
| molecule.num_unusual_torsions = len(molecule.unusual_torsions) | ||
| molecules.append(molecule) | ||
|
|
||
| min_unusual_torsions = min(min_unusual_torsions, molecule.num_unusual_torsions) | ||
|
|
||
| # Write results | ||
| with io.MoleculeWriter(success_file) as passed_writer, \ | ||
| io.MoleculeWriter(failure_file) as failed_writer, \ | ||
| open(unusual_torsion_file, 'w', newline='') as csv_file: | ||
|
|
||
| csv_writer = csv.writer(csv_file) | ||
| csv_writer.writerow(['MoleculeIndex', 'Value', 'Zscore', 'LocalDensity', 'NumHits', 'Atoms']) | ||
|
|
||
| for idx, molecule in enumerate(molecules): | ||
| threshold = torsion_limit if mode == 'absolute' else min_unusual_torsions + torsion_limit | ||
| failed = molecule.num_unusual_torsions > threshold | ||
|
|
||
| if failed: | ||
| failed_writer.write(molecule) | ||
| for torsion in molecule.unusual_torsions: | ||
| csv_writer.writerow([ | ||
| idx, | ||
| torsion.value, | ||
| torsion.z_score, | ||
| torsion.local_density, | ||
| torsion.nhits, | ||
| ' '.join(torsion.atom_labels) | ||
| ]) | ||
| else: | ||
| passed_writer.write(molecule) | ||
|
|
||
|
|
||
| def run(): | ||
| """Main entry point for the script.""" | ||
| args = parse_args() | ||
|
|
||
| if args.torsion_limit < 0: | ||
| raise ValueError('Torsion limit must be >= 0') | ||
|
|
||
| engine = create_mogul_engine( | ||
| args.local_density_threshold, | ||
| args.incl_organometallics, | ||
| args.generalisation | ||
| ) | ||
|
|
||
| analysis( | ||
| args.torsion_limit, | ||
| args.inmolfn, | ||
| args.mode, | ||
| engine, | ||
| args.successfn, | ||
| args.failurefn, | ||
| args.unusualtorsionsfn, | ||
| ) | ||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| run() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| # Filter Poses | ||
|
|
||
| This is a short script to filter molecular poses in a multi-molecule file based on the torsion probabilities. | ||
|
|
||
| CCDC Python API Licence required, minimum version: 3.0.15 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as previous comment please :) And can you add the scripts to the ReadMe in the script folder too please :) |
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| #!/usr/bin/env python3 | ||
| # | ||
| # This script can be used for any purpose without limitation subject to the | ||
| # conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx | ||
| # | ||
| # This permission notice and the following statement of attribution must be | ||
| # included in all copies or substantial portions of this script. | ||
| # | ||
| # 2025-02-03: created by the Cambridge Crystallographic Data Centre | ||
|
|
||
| import argparse | ||
| import copy | ||
| import csv | ||
| import math | ||
|
|
||
| from ccdc import conformer, io | ||
|
|
||
|
|
||
| def parse_args(): | ||
| """Parse command line arguments.""" | ||
| parser = argparse.ArgumentParser(description=__doc__) | ||
|
|
||
| parser.add_argument('conformer_file', | ||
| metavar='<input file>', | ||
| help='Input file (multi-molecule file)') | ||
|
|
||
| parser.add_argument('-csv', '--write-csv', | ||
| dest='write_csv', | ||
| action='store_true', | ||
| help='Write a csv file for all the analysed conformers.') | ||
|
|
||
| return parser.parse_args() | ||
|
|
||
|
|
||
| class ProbabilityScorer: | ||
| """ | ||
| Use the ConformerGenerator and GeometryAnalyser to score conformers based on their conformational | ||
| probabilities and unusual torsions. | ||
| """ | ||
| def __init__(self, user_conformer_generator_settings=None, skip_minimisation=True, | ||
| user_mogul_analysis_settings=None): | ||
|
|
||
| self._generator = self._create_conformer_generator(user_conformer_generator_settings, skip_minimisation) | ||
| self._mogul_analysis_engine = self._create_analyser(user_mogul_analysis_settings) | ||
|
|
||
| def _create_analyser(self, user_mogul_analyser_settings): | ||
| """Create a GeometryAnalyser engine to analyse the conformers.""" | ||
| engine = conformer.GeometryAnalyser() | ||
|
|
||
| # Tweak the 'default defaults' | ||
| # By default, in this use case, we do not want to use generalisation. | ||
| engine.settings.generalisation = False | ||
| # By default, only the organic subset, i.e. exclude organometallic | ||
| engine.settings.organometallic_filter = 'Organic' | ||
|
|
||
| # Ensure user settings are kept: | ||
| if user_mogul_analyser_settings is not None: | ||
| engine.settings = copy.copy(user_mogul_analyser_settings) | ||
|
|
||
| # Only analyse torsions: | ||
| engine.settings.bond.analyse = False | ||
| engine.settings.angle.analyse = False | ||
| engine.settings.ring.analyse = False | ||
|
|
||
| return engine | ||
|
|
||
| def _create_conformer_generator(self, user_generator_settings, skip_minimisation=True): | ||
| """Create a ConformerGenerator engine to generate conformers for the molecules.""" | ||
| settings = conformer.ConformerSettings() | ||
| if user_generator_settings is not None: | ||
| settings = copy.copy(user_generator_settings) | ||
|
|
||
| # Mandatory setting here: this must work in use_input_torsion_distributions mode. | ||
| settings.use_input_torsion_distributions = True | ||
|
|
||
| engine = conformer.ConformerGenerator(settings, skip_minimisation) | ||
| return engine | ||
|
|
||
| @staticmethod | ||
| def _combined_local_density(all_local_densities): | ||
| if all_local_densities: | ||
| return sum(all_local_densities) / len(all_local_densities) | ||
| return 100.0 | ||
|
|
||
| def probability_analysis(self, molecule, bad_normalised_score=1.0, bad_probability_score=0.0, | ||
| bad_rmsd_score=9999.9): | ||
|
|
||
| # Approximation excluding rings: | ||
| is_rigid = sum(bond.is_rotatable for bond in molecule.bonds) == 0 | ||
|
|
||
| conformers = self._generator.generate(molecule) | ||
|
|
||
| normalised_score = None | ||
| rmsd = None | ||
| ln_probability = None | ||
|
|
||
| if conformers: | ||
| # First conformer is the most likely, so return its score. | ||
| normalised_score = conformers[0].normalised_score | ||
| rmsd = conformers[0].rmsd() | ||
| ln_probability = conformers[0].probability | ||
|
|
||
| if is_rigid: | ||
| if ln_probability is None: | ||
| ln_probability = 0.0 | ||
| if normalised_score is None: | ||
| normalised_score = 0.0 | ||
|
|
||
| if normalised_score is None: | ||
| normalised_score = bad_normalised_score | ||
|
|
||
| if rmsd is None: | ||
| rmsd = bad_rmsd_score | ||
|
|
||
| probability = bad_probability_score | ||
| if ln_probability is not None: | ||
| probability = math.exp(ln_probability) | ||
|
|
||
| return normalised_score, probability, rmsd | ||
|
|
||
| def unusual_torsions_analysis(self, molecule, max_hist_size=15): | ||
| checked_mol = self._mogul_analysis_engine.analyse_molecule(molecule) | ||
| unusual_count = 0 | ||
| local_densities = [] | ||
| unusual_local_densities = [] | ||
| no_data_torsions = 0 | ||
| for tor in checked_mol.analysed_torsions: | ||
| hist_size = sum(tor.histogram()) | ||
| if hist_size > max_hist_size: | ||
| local_densities.append(tor.local_density) | ||
|
|
||
| if tor.unusual: | ||
| unusual_local_densities.append(tor.local_density) | ||
| unusual_count += 1 | ||
| else: | ||
| no_data_torsions += 1 | ||
|
|
||
| # Expected number of torsions within 10 degrees assuming even distribution of observations. | ||
| # Number of Mogul bins would be strictly better than assuming 18. | ||
| uniform_torsion_dist = 100 / 18 | ||
| local_densities.append(uniform_torsion_dist) | ||
| return unusual_count, self._combined_local_density(local_densities), self._combined_local_density( | ||
| unusual_local_densities), no_data_torsions | ||
|
|
||
| def process_molecule(self, molecule): | ||
| molecule.remove_unknown_atoms() | ||
| molecule.assign_bond_types() | ||
|
|
||
| unusual_count, avg_local_density, avg_unusual_local_density, no_data_torsions = self.unusual_torsions_analysis( | ||
| molecule) | ||
| normalised_score, probability, rmsd = self.probability_analysis(molecule) | ||
|
|
||
| return { | ||
| "identifier": molecule.identifier, | ||
| "number of unusual torsions": unusual_count, | ||
| "average local density": avg_local_density, | ||
| "average unusual local density": avg_unusual_local_density, | ||
| "number of torsions with no data": no_data_torsions, | ||
| "normalised probability score": normalised_score, | ||
| "probability score": probability, | ||
| "RMSD to input conformation": rmsd, | ||
| } | ||
|
|
||
|
|
||
| def run(): | ||
| args = parse_args() | ||
|
|
||
| p = ProbabilityScorer() | ||
|
|
||
| all_data = [] | ||
|
|
||
| with io.MoleculeReader(args.conformer_file) as reader: | ||
| for i, molecule in enumerate(reader): | ||
| data = p.process_molecule(molecule) | ||
| if i == 0: | ||
| print(", ".join(data.keys())) | ||
| print(", ".join(str(value) for value in data.values())) | ||
| all_data.append(data) | ||
|
|
||
| if args.write_csv: | ||
| keys = all_data[0].keys() | ||
| with open('filtered_poses_analysis.csv', 'w', newline='') as output_file: | ||
| dict_writer = csv.DictWriter(output_file, keys) | ||
| dict_writer.writeheader() | ||
| dict_writer.writerows(all_data) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| run() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please expan on this Readme to include the sections that is in the template https://github.com/ccdc-opensource/csd-python-api-scripts/tree/main/scripts/new_script_readme_template