pinder.data package#

Subpackages#

Submodules#

pinder.data.alignment_utils module#

class pinder.data.alignment_utils.DomainInfo(pdb: 'str', chain: 'str', ecod_domain_id: 'str', t_name: 'str')[source][source]#

Bases: object

pdb: str#

chain: str#

ecod_domain_id: str#

t_name: str#

class pinder.data.alignment_utils.Domain(pdb_id: 'str', chain: 'str', domain_id: 'str', t_name: 'str', pdb_from: 'int', pdb_to: 'int')[source][source]#

Bases: object

pdb_id: str#

chain: str#

domain_id: str#

t_name: str#

pdb_from: int#

pdb_to: int#

classmethod from_domain_info(domain_info: DomainInfo, start: int, end: int) → Domain[source]#

class pinder.data.alignment_utils.Alignment(pdbid1: 'str', pdbid2: 'str', alntmscore: 'float', qstart: 'int', qend: 'int', qlen: 'int', tstart: 'int', tend: 'int', tlen: 'int', alnlen: 'int')[source][source]#

Bases: object

pdbid1: str#

pdbid2: str#

alntmscore: float#

qstart: int#

qend: int#

qlen: int#

tstart: int#

tend: int#

tlen: int#

alnlen: int#

classmethod from_line(line: str) → Alignment[source]#

classmethod from_foldseek_line(line: str) → Alignment[source]#

classmethod from_ecod_info(ecod_info_pair: tuple[Domain, Domain]) → Alignment[source]#

indices1() → set[int][source]#

indices2() → set[int][source]#

flip_query_and_target() → Alignment[source]#

class pinder.data.alignment_utils.Interface(pdbid1: 'str', pdbid2: 'str', indices1: 'Set[int]', indices2: 'Set[int]', alignments1: 'Set[Tuple[str, float]]', alignments2: 'Set[Tuple[str, float]]')[source][source]#

Bases: object

pdbid1: str#

pdbid2: str#

indices1: Set[int]#

indices2: Set[int]#

alignments1: Set[Tuple[str, float]]#

alignments2: Set[Tuple[str, float]]#

classmethod from_line(line: str) → Interface[source]#

classmethod from_system(system: PinderSystem, radius: float) → Interface[source]#

classmethod from_contact_info(contact_info: dict[str, str | int | float | bool]) → Interface[source]#

flip_interface() → Interface[source]#

pinder.data.alignment_utils.get_foldseek_contacts(dimer: PinderSystem, radius: float = 10.0, backbone_definition: str = 'dockq', return_calpha_only: bool = True) → tuple[ndarray[Any, dtype[int64]], ndarray[Any, dtype[int64]]][source][source]#

pinder.data.alignment_utils.get_foldseek_numbering(arr: AtomArray) → dict[int, int][source][source]#

pinder.data.alignment_utils.get_foldseek_dimer_contacts(dimer_pdb: Path, contact_config: ContactConfig = ContactConfig(heavy_only=True, backbone_only=True, backbone_definition='dockq', radius=10.0, only_unique_resi=True, min_length=3)) → dict[str, str | int | float | bool] | None[source][source]#

pinder.data.alignment_utils.generate_dimer_foldseek_contacts(dimer_pdb: Path, contact_config: ContactConfig = ContactConfig(heavy_only=True, backbone_only=True, backbone_definition='dockq', radius=10.0, only_unique_resi=True, min_length=3), use_cache: bool = True) → tuple[dict[str, str | int | float | bool], Path] | None[source][source]#

pinder.data.alignment_utils.populate_foldseek_contacts(dimer_pdbs: list[Path], contact_config: ContactConfig = ContactConfig(heavy_only=True, backbone_only=True, backbone_definition='dockq', radius=10.0, only_unique_resi=True, min_length=3), use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

Process batch of Dimer PDBs to store contacts with different configurations for foldseek.

Parameters:

dimer_pdbs: list[Path]: List of dimer PDBs to get contacts for.
use_cache: bool: Whether to skip generation of contacts if the contacts.json corresponding to the config hash exists.
parallel: bool: Whether to populate entries in parallel.
max_workers: int, optional: Limit number of parallel processes spawned to max_workers.

pinder.data.alignment_utils.safe_read_contact_json(contact_json: Path) → dict[str, str | int | float | bool] | None[source][source]#

pinder.data.alignment_utils.load_contact_info(pdb_file: Path, contact_json: Path, config: ContactConfig) → dict[str, str | int | float | bool] | None[source][source]#

pinder.data.alignment_utils.collect_contact_jsons(data_dir: Path, dimer_pdbs: list[Path], config: ContactConfig, config_hash: str, use_cache: bool = True) → None[source][source]#

Build a mapping from monomer pairs to Interface objects and write it to a pickle file.

Collects json files storing foldseek-formatted contacts (assumed to consist of two monomers), creates Interface objects using these data, filters those Interfaces based on criteria, and constructs a map between pairs of monomer identifiers (str) and Interface objects that describe the interface between the monomer pair.

Any skipped systems are written to a log file.

Parameters:

data_dir: Path: Path to the data ingestion directory.
dimer_pdbs: list[Path]: List of dimer PDBs which should have contact jsons on disk.
config: ContactConfig: Config object used to determine the directory with a hash name based on config used for contacts.
config_hash: str: MD5 hash of the config object used.
use_cache: bool: Whether to skip creation of the interface dictionary if the output pickle file exists on disk.

Returns:

None: The interface dictionary is written to a pickle file called interfaces.pkl, with contents of type dict[tuple[str, str], Interface].

pinder.data.alignment_utils.load_interface_pkl(interface_pkl: Path) → dict[tuple[str, str], Interface][source][source]#

pinder.data.alignment_utils.write_interface_dict(interface_dict: dict[tuple[str, str], Interface], filepath: Path) → None[source][source]#

Write the interface dictionary to pkl file.

Parameters:

interface_dict: Dict[Tuple[str, str], Interface]: Dictionary mapping dimers (tuples of monomer IDs) to Interface objects
filepath: Path: The path to which to write “interfaces.pkl”

pinder.data.alignment_utils.get_interfaces_from_config(contact_root: Path, config_hash: str | None = None) → dict[tuple[str, str], Interface][source][source]#

pinder.data.apo_utils module#

pinder.data.apo_utils.sufficient_atom_types(struct: Structure, min_atom_types: int = 3) → bool[source][source]#

Checks if the given structure contains at least a minimum number of unique atom types.

Parameters:

struct (Structure) – The structure to be evaluated.
min_atom_types (int, optional) – The minimum number of unique atom types required. Default is 3.

Returns:

True if the structure contains at least the specified number of unique atom types, including ‘CA’.

Return type:

bool

pinder.data.apo_utils.sufficient_residues(struct: Structure, min_residues: int = 5) → bool[source][source]#

Determines if a structure contains at least a specified minimum number of residues.

Parameters:

struct (Structure) – The structure to be evaluated.
min_residues (int, optional) – The minimum number of residues required. Default is 5.

Returns:

True if the structure has at least the specified number of residues.

Return type:

bool

pinder.data.apo_utils.valid_structure(pdb_file: Path) → Structure | None[source][source]#

Attempts to create a Structure instance from a PDB file.

Parameters: pdb_file (Path) – The path to the PDB file.
Returns: The loaded Structure object if successful, None if an error occurs.
Return type: Structure | None

pinder.data.apo_utils.validate_apo_monomer(apo_id: str, pdb_dir: Path, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))) → dict[str, str | bool][source][source]#

Validates an apo monomer by checking atom types and residue count against configuration thresholds.

Parameters:

apo_id (str) – The identifier for the apo monomer.
pdb_dir (Path) – The directory containing PDB files.
config (ApoPairingConfig, optional) – Configuration settings with thresholds for validation.

Returns:

A dictionary containing the monomer ID and its validation status.

Return type:

dict[str, str | bool]

pinder.data.apo_utils.holo_apo_seq_identity(holo_seq: str, apo_seq: str) → dict[str, str | float][source][source]#

Computes the sequence identity between a holo sequence and an apo sequence.

Parameters:

holo_seq (str) – The sequence of the holo structure.
apo_seq (str) – The sequence of the apo structure.

Returns:

A dictionary containing the holo sequence, apo sequence, and their sequence identity.

Return type:

dict[str, str | float]

pinder.data.apo_utils.chain_instance_from_chain(ch: str) → int[source][source]#

Extracts the instance/copy number from a chain identifier.

Parameters: ch (str) – The chain identifier, which may contain digits representing the instance number.
Returns: The extracted instance number, defaulting to 1 if no digits are found.
Return type: int

pinder.data.apo_utils.remove_apo_chain_copies(monomer_df: DataFrame) → DataFrame[source][source]#

Removes duplicate chain entries from a DataFrame based on chain instance numbers.

Parameters: monomer_df (pd.DataFrame) – The DataFrame containing monomer data with a ‘chain’ column.
Returns: A DataFrame filtered to include only the first instance of each chain.
Return type: pd.DataFrame

pinder.data.apo_utils.remove_dimer_chain_copies(dimer_df: DataFrame) → DataFrame[source][source]#

Filters out dimer chains that are duplicates based on their instance numbers.

Parameters: dimer_df (pd.DataFrame) – A DataFrame containing data for dimers with columns for ‘chain_R’ and ‘chain_L’.
Returns: The DataFrame filtered to exclude entries where both chains are copies.
Return type: pd.DataFrame

pinder.data.apo_utils.hybrid_align(apo_monomer: Structure, holo_monomer: Structure, align_method: str = 'pymol') → tuple[Structure, dict[str, int | float]][source][source]#

Performs structural alignment between an apo monomer and a holo monomer using specified alignment methods.

The function supports alignment using either PyMOL or Biotite libraries, depending on the ‘align_method’ specified. The alignment results include the aligned structure and metrics such as RMSD and the number of aligned atoms.

Parameters:

apo_monomer (Structure) – The apo monomer structure to align.
holo_monomer (Structure) – The holo monomer structure as the reference.
align_method (str) – The alignment method to use; defaults to “pymol”. Options include “pymol” and “biotite”.

Returns:

A tuple containing the aligned apo monomer structure and a dictionary with alignment metrics.

Return type:

tuple[Structure, dict[str, int | float]]

pinder.data.apo_utils.get_superimposed_metrics(holo_ref: Structure, apo_mono: Structure, body: str, unbound_id: str, holo_R: Structure, holo_L: Structure, rec_res: list[int], lig_res: list[int], bound_contacts: set[tuple[str, str, int, int]], holo2apo_seq: dict[str, dict[int, int]], apo2holo_seq: dict[str, dict[int, int]], config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))) → dict[str, str | float | int][source][source]#

Calculates and returns various metrics after superimposing the apo monomer onto the holo reference.

This function assesses interface contacts, sequence identity, and structural alignment quality between an apo and a holo structure, providing metrics that aid in evaluating the apo-holo pairing suitability.

Parameters:

holo_ref (Structure) – The holo reference structure used for alignment.
apo_mono (Structure) – The apo structure to align and analyze.
body (str) – Indicates whether the structure represents the ‘receptor’ or ‘ligand’ side.
unbound_id (str) – A unique identifier for the pairing, typically combining IDs of involved structures.
holo_R (Structure) – The holo structure of the receptor side.
holo_L (Structure) – The holo structure of the ligand side.
rec_res (list[int]) – List of receptor residues involved in holo interface contacts.
lig_res (list[int]) – List of ligand residues involved in holo interface contacts.
bound_contacts (set[tuple[str, str, int, int]]) – Set of tuples detailing contacts in the bound state.
holo2apo_seq (dict[str, dict[int, int]]) – Mapping of holo to apo sequences by residue numbers.
apo2holo_seq (dict[str, dict[int, int]]) – Mapping of apo to holo sequences by residue numbers.
config (ApoPairingConfig) – Configuration object with parameters like contact radius and alignment method.

Returns:

Dictionary of calculated metrics including interface residues, RMSD, sequence identity, and alignment scores.

Return type:

dict[str, str | float | int]

pinder.data.apo_utils.get_sequence_based_metrics(apo_monomer_id: str, body: str, apo_complex: Structure, apo_R: Structure, apo_L: Structure, R_chain: str, L_chain: str, rec_res: list[int], lig_res: list[int], holo2apo_seq: dict[str, dict[int, int]]) → dict[str, str | float | int][source][source]#

Gathers sequence-based metrics for an apo monomer pairing based on sequence alignment and structural data. Metrics calculated here do not require any structural superposition.

Parameters:

apo_monomer_id (str) – Identifier for the apo monomer.
body (str) – Designates whether the monomer is treated as ‘receptor’ or ‘ligand’.
apo_complex (Structure) – Combined structure of apo monomer and holo counterpart body.
apo_R (Structure) – Structure of the apo monomer acting as the receptor.
apo_L (Structure) – Structure of the apo monomer acting as the ligand.
R_chain (str) – Chain identifier for the receptor.
L_chain (str) – Chain identifier for the ligand.
rec_res (list[int]) – List of holo receptor interface residues.
lig_res (list[int]) – List of holo ligand interface residues.
holo2apo_seq (dict[str, dict[int, int]]) – Mapping from holo to apo residues.

Returns:

Metrics related to sequence alignment and interface composition.

Return type:

dict[str, str | float | int]

pinder.data.apo_utils.get_unbound_id(holo_R: Structure, holo_L: Structure, apo_R: Structure, apo_L: Structure, body: str) → str[source][source]#

pinder.data.apo_utils.get_apo_pairing_metrics_for_id(df: DataFrame, pdb_dir: Path, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))) → DataFrame | None[source][source]#

Computes various structural and sequence-based metrics for apo-holo pairings for a given dataset of identifiers.

This function loads structures, performs alignments, and computes interface and sequence identity metrics for a set of potential apo-holo pairs specified in a DataFrame.

Parameters:

df (pd.DataFrame) – DataFrame containing identifiers and other data for apo-holo pairings.
pdb_dir (Path) – Path to the directory containing PDB files of the structures.
config (ApoPairingConfig) – Configuration object specifying parameters for alignment and analysis.

Returns:

DataFrame containing computed metrics for each pairing, or None if an error occurs.

Return type:

pd.DataFrame | None

pinder.data.apo_utils.calculate_frac_monomer_dimer_overlap(df: DataFrame, pdb_dir: Path, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))) → DataFrame[source][source]#

Calculates the fractional overlap of residues between apo monomers and their corresponding holo forms in the dimer. This method attempts to capture metrics for cases where a single apo monomer contains all or most of the full dimer complex, thereby making it impossible to predict the holo structure starting from apo.

Parameters:

df (pd.DataFrame) – DataFrame containing data for which overlap metrics need to be calculated.
pdb_dir (Path) – Directory where PDB files are located.
config (ApoPairingConfig) – Configuration parameters used in the calculation, such as sequence alignment settings.

Returns:

Updated DataFrame with calculated overlap metrics.

Return type:

pd.DataFrame

pinder.data.config module#

class pinder.data.config.PinderDataGenConfig(interacting_chains_backbone_only: bool = True, interacting_chains_radius: float = 10.0, connected_component_radius: float = 15.0, max_assembly_chains: int = 500)[source][source]#

Bases: object

A class to represent configuration parameters used to generate dataset.

Attributes:

interacting_chains_backbone_only: bool: Whether to define contacts between interacting chains based on backbone atoms.
interacting_chains_radius: float: The radius to use when detecting contacts between putative interacting chains.
connected_component_radius: float: The radius to use when calculating connected components.
max_assembly_chains: int: The maximum number of chains allowed in the bio-assembly to consider for ingestion.

interacting_chains_backbone_only: bool = True#

interacting_chains_radius: float = 10.0#

connected_component_radius: float = 15.0#

max_assembly_chains: int = 500#

class pinder.data.config.ContactConfig(heavy_only: bool = True, backbone_only: bool = True, backbone_definition: BackboneDefinition = 'dockq', radius: float = 10.0, only_unique_resi: bool = True, min_length: int = 3)[source][source]#

Bases: object

A class to represent configuration parameters used to generate foldseek contacts.

Attributes:

heavy_only: bool: Whether to limit contact search to heavy atoms.
backbone_only: bool: Whether to limit contact search to only backbone atoms for ingestion.
backbone_definition: BackboneDefinition: Which atom names define backbone atoms. dockq: (CA, N, C, O) vs. biotite: (CA, N, C)
radius: float: The radius to use for detecting contacts between interacting chains.
only_unique_resi: bool: Whether to only return unique residue IDs making contacts.
min_length: int: Minimum interface length per chain. Note: it's not currently used when extracting the contacts.

heavy_only: bool = True#

backbone_only: bool = True#

backbone_definition: BackboneDefinition = 'dockq'#

radius: float = 10.0#

only_unique_resi: bool = True#

min_length: int = 3#

class pinder.data.config.TransientInterfaceConfig(radius: float = 2.3, min_buried_sasa: float = 1000.0, disulfide_bond_distance: float = 2.05, disulfide_bond_distance_tol: float = 0.05, disulfide_bond_dihedral: float = 90.0, disulfide_bond_dihedral_tol: float = 10.0)[source][source]#

Bases: object

A class to represent configuration parameters used to annotate potentially transient interfaces.

Attributes:

radius: float: Radius used to detect inter-chain bonds like di-sulfide bonds that may be inducing/stabilizing the interface. Default is 2.3 Å.
min_buried_sasa: float: The minimum buried surface area to not be considered a potentially transient interface. Default is 1000.0 Å^2.
disulfide_bond_distance: float: Bond distance used to detect potential disulfide bridges.
disulfide_bond_distance_tol: float: Tolerance to pad bond distance threshold by when calculating distances.
disulfide_bond_dihedral: float: Bond dihedral angle used to detect potential disulfide bridges.
disulfide_bond_dihedral_tol: float: Tolerance to pad bond dihedral angle threshold by when calculating dihedrals.

radius: float = 2.3#

min_buried_sasa: float = 1000.0#

disulfide_bond_distance: float = 2.05#

disulfide_bond_distance_tol: float = 0.05#

disulfide_bond_dihedral: float = 90.0#

disulfide_bond_dihedral_tol: float = 10.0#

class pinder.data.config.FoldseekConfig(sensitivity: float = 11.0, evalue: float = 0.05, score_type: str = 'lddt', max_seqs: int = 1000, alignment_type: int = 2, alignment_filename: str = 'alignment.txt')[source][source]#

Bases: object

A class to represent configuration parameters used in foldseek search.

Attributes:

sensitivity: float

Adjust sensitivity to speed trade-off; lower is faster, higher more sensitive (1.0 faster; 4.0 fast; 7.5 sensitive; default 9.5; pinder default 11.0)

evalue: float

List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures. Pinder default is 0.05.

score_type: str

Alignment metric to use as primary score. Must be one of lddt, alntmscore. Default is lddt.

max_seqs: int

Maximum results per query sequence allowed to pass the prefilter (affects sensitivity). Default is 1000.

alignment_type: int

Which alignment type to use in generating alignments. The main options are:

1. TMalign, which is actually an optimized version of TM, Foldseek-TM. This option is global and slow; it is selected with --alignment-type 1.
2. 3Di+AA Gotoh-Smith-Waterman, which is the default. This option is local and fast; it is selected with --alignment-type 2.

alignment_filename: str

Alignment output filename. Defaults to alignment.txt.

sensitivity: float = 11.0#

evalue: float = 0.05#

score_type: str = 'lddt'#

max_seqs: int = 1000#

alignment_type: int = 2#

alignment_filename: str = 'alignment.txt'#

class pinder.data.config.MMSeqsConfig(sensitivity: float = 11.0, evalue: float = 0.05, score_type: str = 'pident', min_seq_id: float = 0.2, max_seqs: int = 1000, alignment_filename: str = 'alignment.txt')[source][source]#

Bases: object

A class to represent configuration parameters used in MMSeqs2 search.

Attributes:

sensitivity: float: Adjust sensitivity to speed trade-off; lower is faster, higher is more sensitive (1.0 faster; 4.0 fast; 7.5 sensitive; mmseqs default 5.7; pinder default 11.0).
evalue: float: List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures. Pinder default is 0.05.
score_type: str: Alignment metric to use as primary MMSeqs2 score. Currently only pident is allowed.
min_seq_id: float: List matches above this sequence identity (for clustering) (range 0.0-1.0). Default is 0.2.
max_seqs: int: Maximum results per query sequence allowed to pass the prefilter (affects sensitivity). Default is 1000.
alignment_filename: str: Alignment output filename. Defaults to alignment.txt.

sensitivity: float = 11.0#

evalue: float = 0.05#

score_type: str = 'pident'#

min_seq_id: float = 0.2#

max_seqs: int = 1000#

alignment_filename: str = 'alignment.txt'#

class pinder.data.config.GraphConfig(min_interface_length: int = 7, min_alignment_length: int = 10, score_threshold: float = 0.5, upper_threshold: float = 1.1, mmseqs_score_threshold: float = 30.0, mmseqs_upper_threshold: float = 110.0, coverage_threshold: float = 0.5)[source][source]#

Bases: object

A class to represent configuration parameters used in constructing graphs from alignments.

Attributes:

min_interface_length: int: Minimum length of interface for clustering. Default is 7.
min_alignment_length: int: Minimum length of alignment for clustering. Default is 10.
score_threshold: float: Score threshold for clustering. Default is 0.5.
upper_threshold: float: Upper score threshold for clustering. Default is 1.1.
mmseqs_score_threshold: float: MMSeqs2 score threshold for clustering. Default is 30.
mmseqs_upper_threshold: float: Upper score threshold for MMSeqs2 clustering. Default is 110.
coverage_threshold: float: Coverage threshold for clustering. Default is 0.5.

min_interface_length: int = 7#

min_alignment_length: int = 10#

score_threshold: float = 0.5#

upper_threshold: float = 1.1#

mmseqs_score_threshold: float = 30.0#

mmseqs_upper_threshold: float = 110.0#

coverage_threshold: float = 0.5#

class pinder.data.config.ScatterConfig(two_char_batch_size: int = 2, mmcif_batch_size: int = 250, graphql_batch_size: int = 50000, dimer_batch_size: int = 5000, predicted_batch_size: int = 20000, foldseek_db_size: int = 50000, apo_pairing_id_batch_size: int = 20000)[source][source]#

Bases: object

A class to represent batching parameters used to scatter data pipeline tasks.

Attributes:

two_char_batch_size: int: Target number of two_char_codes per task batch.
mmcif_batch_size: int: Target number of raw mmcif files to ingest per task batch.
graphql_batch_size: int: Target number of PDB IDs per graphql task batch.
dimer_batch_size: int: Target number of dimer PDB files to annotate per task batch.
predicted_batch_size: int: Target number of pdb entries per predicted monomer population task.
foldseek_db_size: int: Target number of PDB files per sub-database to run all-vs-all foldseek on.
apo_pairing_id_batch_size: int: Target number of holo-apo-R/L pairing IDs per apo eval task batch.

two_char_batch_size: int = 2#

mmcif_batch_size: int = 250#

graphql_batch_size: int = 50000#

dimer_batch_size: int = 5000#

predicted_batch_size: int = 20000#

foldseek_db_size: int = 50000#

apo_pairing_id_batch_size: int = 20000#

class pinder.data.config.ClusterConfig(seed: int = 40, canonical_method: str = 'foldseek_community', edge_weight: str | None = 'weight', foldseek_cluster_edge_threshold: float = 0.7, foldseek_edge_threshold: float = 0.55, foldseek_af2_difficulty_threshold: float = 0.7, mmseqs_edge_threshold: float = 0.0, resolution_thr: float = 3.5, min_chain_length: int = 40, min_atom_types: int = 3, max_var_thr: float = 0.98, oligomeric_count: int = 2, method: str = 'X-RAY DIFFRACTION', interface_atom_gaps_4A: int = 0, prodigy_label: str = 'BIO', number_of_components: int = 1, alphafold_cutoff_date: str = '2021-10-01', depth_limit: int = 2, max_node_degree: int = 1000, top_n: int = 1, min_depth_2_hits_with_comm: int = 1, max_depth_2_hits_with_comm: int = 2000, max_depth_2_hits: int = 1000)[source][source]#

Bases: object

Configuration parameters for clustering pinder dimers and generating splits.

Attributes:

seed: int: Random seed to use for AsynLPA clustering.
canonical_method: str: Name of the “primary” clustering method. Default is foldseek_community.
edge_weight: str | None: The edge attribute for nx.Graph inputs representing the weight of an edge. If None, uses 1 for all weights. Used for AsynLPA clustering. Defaults to “weight”.
foldseek_cluster_edge_threshold: float: The edge weight threshold to use when clustering the foldseek graph. All edges below this threshold are removed from the graph. Defaults to 0.7.
foldseek_edge_threshold: float: The edge weight threshold to use when searching for neighboring nodes in the foldseek graph. Defaults to 0.55.
foldseek_af2_difficulty_threshold: float: The edge weight threshold to use when searching for neighboring nodes in the foldseek graph when establishing an alternative ‘difficulty’ level for the af2mm holdout set using a less strict threshold than the default threshold used for transitive hits deleaking. Defaults to 0.70.
mmseqs_edge_threshold: float: The edge weight threshold to use when searching for neighboring nodes in the mmseqs graph. Defaults to 0.0 (all alignment hits).
resolution_thr: float: Test set criteria: The maximum resolution threshold. Defaults to 3.5.
min_chain_length: int: Test set criteria: The minimum chain length threshold. Defaults to 40.
min_atom_types: int: Test set criteria: The minimum number of atom types (currently tracked as number of elements). Defaults to 3.
max_var_thr: float: Test set criteria: The maximum variance threshold. Defaults to 0.98.
oligomeric_count: int: Test set criteria: oligomer count in the original RCSB entry. Defaults to 2 for dimers.
method: str: Test set criteria: experimental method used to generate structure. Defaults to X-RAY DIFFRACTION
interface_atom_gaps_4A: int: Test set criteria: maximum number of atom gaps within 4A of the interface residues. Defaults to 0.
prodigy_label: str: Test set criteria: the interaction type label as reported by prodigy_cryst. Defaults to BIO for biological interactions.
number_of_components: int: Test set criteria: maximum number of components in a chain (checks for detached components). Defaults to 1.
alphafold_cutoff_date: str: Test set criteria: The AF2 training cutoff date to use when constructing a holdout set for evaluating AF2-MM. Defaults to 2021-10-01.
depth_limit: int: Deleaking: maximum depth to hop between node neighborhoods when performing depth-first search on the graph for transitive hits. Default is 2.
max_node_degree: int: Deleaking: The maximum node degree at which we assume there is leakage when performing search for transitive hits. Defaults to 1_000.
top_n: int: Splitting: The maximum number of representatives per cluster ID. Defaults to 1.
min_depth_2_hits_with_comm: int: Splitting: The minimum number of depth_2 (or depth_limit) hits with community clustering. Defaults to 1.
max_depth_2_hits_with_comm: int: Splitting: The maximum number of depth_2 (or depth_limit) hits with community clustering. Defaults to 2_000.
max_depth_2_hits: int: Splitting: The maximum number of depth_2 (or depth_limit) hits. Defaults to 1_000.

seed: int = 40#

canonical_method: str = 'foldseek_community'#

edge_weight: str | None = 'weight'#

foldseek_cluster_edge_threshold: float = 0.7#

foldseek_edge_threshold: float = 0.55#

foldseek_af2_difficulty_threshold: float = 0.7#

mmseqs_edge_threshold: float = 0.0#

resolution_thr: float = 3.5#

min_chain_length: int = 40#

min_atom_types: int = 3#

max_var_thr: float = 0.98#

oligomeric_count: int = 2#

method: str = 'X-RAY DIFFRACTION'#

interface_atom_gaps_4A: int = 0#

prodigy_label: str = 'BIO'#

number_of_components: int = 1#

alphafold_cutoff_date: str = '2021-10-01'#

depth_limit: int = 2#

max_node_degree: int = 1000#

top_n: int = 1#

min_depth_2_hits_with_comm: int = 1#

max_depth_2_hits_with_comm: int = 2000#

max_depth_2_hits: int = 1000#

class pinder.data.config.ApoPairingConfig(apo_chain: str = 'A', contact_rad: float = 10.0, backbone_only: bool = False, heavy_only: bool = False, min_atom_types: int = 3, min_residues: int = 5, min_holo_resolved_frac: float = 0.3, align_method: str = 'pymol', max_refine_rmsd: float = 10.0, min_aligned_apo_res_frac: float = 0.7, min_seq_identity: float = 0.3, max_interface_miss_frac: float = 0.3, max_frac_monomer_dimer_sequence: float = 0.75, invalid_coverage_upper_bound: float = 2.0, invalid_coverage_lower_bound: float = 0.5, scaled_score_metrics: tuple[str, str, str, str, str] = ('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))[source][source]#

Bases: object

Configuration parameters for evaluating and selecting apo-holo pairs.

Attributes:

apo_chain: str: The apo structure chain name. Default is ‘A’ for all monomer structures.
contact_rad: float: The radius to use for detecting contacts between interacting chains in apo-holo evaluation.
backbone_only: bool: Whether to limit contact search to only backbone atoms.
heavy_only: bool: Whether to limit contact search to heavy atoms.
min_atom_types: int: Minimum number of unique atom types to consider a monomer for apo evaluation.
min_residues: int: Minimum number of monomer residues to consider for apo evaluation.
min_holo_resolved_frac: float: Limit apo pairing to those monomers which have at least this fraction of the holo monomer residues resolved. Note: this does not take into account sequence alignment or interface residues.
align_method: str: Alignment backend to use when superimposing apo monomers to their holo counterparts. Allowed values are pymol and biotite. Default is pymol.
max_refine_rmsd: float: Maximum RMSD between the superimposed apo atoms after refinement cycles.
min_aligned_apo_res_frac: float: Minimum fraction of holo residues that are covered by the apo monomer superposition.
min_seq_identity: float: Minimum sequence identity between monomer and holo monomer to consider for apo pairing.
max_interface_miss_frac: float: Maximum fraction of holo interface residues that can be missing in the apo monomer.
max_frac_monomer_dimer_sequence: float: Maximum fraction of full holo dimer sequence represented by the single-body apo monomer. See PDB 2G3D (holo) and 1YJF for an example where this is needed.
invalid_coverage_upper_bound: float: Upper bound on ratio of the number of apo interface residues after superimposing to the counterpart holo monomer vs the holo interface residues for the monomer that it is being paired to before being considered invalid domain coverage.
invalid_coverage_lower_bound: float: Lower bound on ratio of the number of apo interface residues after superimposing to the counterpart holo monomer vs the holo interface residues for the monomer that it is being paired to before being considered invalid domain coverage.
scaled_score_metrics: tuple[str]: Metrics to use when constructing a scaled score for selecting a single canonical apo monomer for receptor and ligand holo monomers.

apo_chain: str = 'A'#

contact_rad: float = 10.0#

backbone_only: bool = False#

heavy_only: bool = False#

min_atom_types: int = 3#

min_residues: int = 5#

min_holo_resolved_frac: float = 0.3#

align_method: str = 'pymol'#

max_refine_rmsd: float = 10.0#

min_aligned_apo_res_frac: float = 0.7#

min_seq_identity: float = 0.3#

max_interface_miss_frac: float = 0.3#

max_frac_monomer_dimer_sequence: float = 0.75#

invalid_coverage_upper_bound: float = 2.0#

invalid_coverage_lower_bound: float = 0.5#

scaled_score_metrics: tuple[str, str, str, str, str] = ('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')#

class pinder.data.config.IalignConfig(rmsd_threshold: float = 5.0, log_pvalue_threshold: float = -9.0, is_score_threshold: float = 0.3, alignment_printout: int = 0, speed_mode: int = 1, min_residues: int = 5, min_interface: int = 5, distance_cutoff: float = 10.0, output_prefix: str = 'output')[source][source]#

Bases: object

Configuration parameters for evaluating potential alignment leakage via iAlign.

Attributes:

rmsd_threshold: float: The maximum RMSD reported by iAlign for considering an interface pair as similar.
log_pvalue_threshold: float: The maximum log P-value reported by iAlign for considering an interface pair as similar.
is_score_threshold: float: The minimum IS-score value reported by iAlign for considering an interface pair as similar.
alignment_printout: int: The -a flag to pass to ialign.pl. 0 - no alignment printout, 1 - concise, 2 - detailed.
speed_mode: int: The -q flag to pass to ialign.pl. 1 - normal (default), 2 - fast.
min_residues: int: The -minp flag to pass to ialign.pl. Minimum number of residues for a protein chain.
min_interface: int: The -mini flag to pass to ialign.pl. Minimum number of residues for an interface.
distance_cutoff: float: The -dc flag to pass to ialign.pl. Distance cutoff for an interfacial contact, default 10.0 A.
output_prefix: str: The -w flag to pass to ialign.pl. Workpath or path to parsed PDB files.

rmsd_threshold: float = 5.0#

log_pvalue_threshold: float = -9.0#

is_score_threshold: float = 0.3#

alignment_printout: int = 0#

speed_mode: int = 1#

min_residues: int = 5#

min_interface: int = 5#

distance_cutoff: float = 10.0#

output_prefix: str = 'output'#

pinder.data.config.get_config_hash(config_obj: ContactConfig | GraphConfig) → str[source][source]#

pinder.data.csv_utils module#

pinder.data.csv_utils.read_csv_non_default_na(csv_file: Path, sep: str = ',', dtype: DtypeArg | None = None, **kwargs: Any) → pd.DataFrame[source][source]#

Read a csv file into pandas DataFrame without casting NA to NaN.

Handle cases like asym_id = NA, which should NOT be cast to NaN!

This method sets keep_default_na to False and passes na_values with all of the default values listed in https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html except for NA.

Parameters:

csv_file: Path: Path to tabular data to read.
sep: str: Character or regex pattern to treat as the delimiter. Defaults to ‘,’. If sep=None, the C engine cannot automatically detect the separator, but the Python parsing engine can, meaning the latter will be used and automatically detect the separator from only the first valid row of the file by Python’s builtin sniffer tool, csv.Sniffer.
dtype: dtype or dict of {Hashable: dtype}, optional: Data type(s) to apply to either the whole dataset or individual columns. E.g., {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’}. Use str or object together with suitable na_values settings to preserve and not interpret dtype.
**kwargs: Any: Any additional kwargs are passed to pd.read_csv.

Returns:

pd.DataFrame

pinder.data.csv_utils.safe_read_csv(csv_file: Path, sep: str = ',') → DataFrame[source][source]#

pinder.data.csv_utils.parallel_read_csvs(csv_files: list[Path], max_workers: int | None = None, sep: str = ',', parallel: bool = True) → list[DataFrame] | None[source][source]#

pinder.data.find_transitive_hits module#

pinder.data.find_transitive_hits.get_potential_representatives(metadata: DataFrame, config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000)) → tuple[DataFrame, DataFrame][source][source]#

Get potential representatives from the metadata based on specified criteria.

Parameters:

metadata (pd.DataFrame) – The metadata containing information about the dataset.
config (ClusterConfig) – The ClusterConfig object containing config for selecting the test set.

Returns:

A tuple containing two DataFrames:

The first DataFrame contains the potential representatives based on the specified criteria.
The second DataFrame contains the potential representatives after a specific date.

Return type:

tuple[pd.DataFrame, pd.DataFrame]

pinder.data.find_transitive_hits.get_test_conversion_dicts(test_index: DataFrame, cluster_key: str = 'cluster_id') → tuple[set[str], dict[tuple[int, int], str]][source][source]#

Convert the test index data into a set of test system IDs and a dictionary mapping cluster IDs to test system IDs.

Parameters:

test_index (pandas.DataFrame) – The test index data.
cluster_key (str, optional) – The column name for the cluster ID. Defaults to “cluster_id”.

Returns:

A tuple containing the set of test system IDs and the dictionary mapping cluster IDs to test system IDs.

Return type:

tuple

pinder.data.find_transitive_hits.get_proto_splits_pindex(index: DataFrame, metadata: DataFrame, cluster_key: str = 'cluster_id', config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000)) → DataFrame[source][source]#: Get the test-train split for the index based on the cluster_id

pinder.data.find_transitive_hits.get_leakage_dict(pinder_dir: Path, graph_type: str, config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), use_cache: bool = True, af2_transitive_hits: bool = False) → None[source][source]#

pinder.data.find_transitive_hits.get_transitive_hits(pinder_dir: Path, config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), test_systems_output: str = 'test_sys_table.csv', deleak_map_output: str = 'transitive_hits_mapping.csv', use_cache: bool = True, af2_transitive_hits: bool = False) → None[source][source]#

pinder.data.find_transitive_hits.cluster_leaks(source: str, graph: Graph, node_to_cluster: dict[str, int], depth: int) → set[str][source][source]#

Find all nodes x in a graph such that:

there exists a path of length l <= <depth> from source to x
source and x are in different clusters

Parameters:

source: str: The source node
graph: nx.Graph: The target graph. Must contain <source>
node_to_cluster: Dict[str, int]: Map from nodes to cluster IDs
depth: int: Maximum allowed path length.

Returns:

Set[str]: The set of nodes in the “<depth>-neighborhood” of source

pinder.data.find_transitive_hits.batch_cluster_leaks(source_set: set[str], graph: Graph, node_to_cluster: dict[str, int], depth: int, max_node_degree: int | None = 1000) → dict[str, set[str]][source][source]#

Find cluster leaks for all nodes in source_set.

For each node, finds the set of all neighbors in graph within depth hops of node that are in different clusters.

Applies checks to limit computation. If a node fails any check, the corresponding value will be CONSIDER_LEAKED.

Parameters:

source_set: Set[str]: A set of target sources
graph: nx.Graph: The target graph. Must contain all nodes in <source_set>
node_to_cluster: Dict[str, int]: Map from nodes to cluster IDs
depth: int: Maximum allowed path length.
max_node_degree: int: Maximum allowed node degree. Default 1000

Returns:

Dict[str, Set[str]]: Mapping from a node to the set of nodes in the node’s “<depth>-neighborhood”

pinder.data.find_transitive_hits.get_leak_map_for_id(pure_id: str, pure_map_forward: dict[str, tuple[str, str]], pure_map_backward: dict[frozenset[str], set[str]], corrupt_map_backward: dict[frozenset[str], set[str]], all_pure_pairs: set[frozenset[str]], all_corrupt_pairs: set[frozenset[str]], potential_leaks: dict[str, set[str]]) → set[str][source][source]#: Do the multi-step mapping from: (1) pure system -> pure pair of nodes; (2) pure pair of nodes -> possibly corrupt leaks for each node; (3) possibly corrupt leaks -> corrupt systems (by intersection).

pinder.data.find_transitive_hits.map_leak_pairs(pure_split: set[str], pure_map_forward: dict[str, tuple[str, str]], pure_map_backward: dict[frozenset[str], set[str]], corrupt_map_backward: dict[frozenset[str], set[str]], all_pure_pairs: set[frozenset[str]], all_corrupt_pairs: set[frozenset[str]], potential_leaks: dict[str, set[str]]) → dict[str, set[str]][source][source]#

pinder.data.find_transitive_hits.find_split_leakage(pure_split: set[str], corrupt_split: set[str], graph: Graph, node_to_cluster: dict[str, int], depth: int, edge_threshold: float = 0.65, max_node_degree: int = 1000, potential_leaks_chkpt: Path | None = None, use_cache: bool = True) → dict[str, set[str]][source][source]#

Find leakage between transitive neighbors of two putative splits.

For systems in pure_split, determine whether there are systems in corrupt_split that leak into the pure_split.

Parameters:

pure_split – Set[str] The set of PINDER System IDs in the pure split (e.g., test).
corrupt_split – Set[str] The set of PINDER System IDs in the corrupt split (e.g., train).
graph – nx.Graph The foldseek-similarity monomer graph.
node_to_cluster – Dict[str, int] The map from graph nodes to cluster IDs.
depth – int The maximum path length to travel looking for leakage.
edge_threshold – float, optional The threshold for considering edges in the graph. Default is 0.65.
max_node_degree – int, optional The node degree at which we assume there is leakage. This is to save compute. Default is 1000.
potential_leaks_chkpt – Path, optional The path to checkpoint potential leaks. Default is None.
use_cache – bool, optional Whether to use cached results. Default is True.

Returns:

Dict[str, Set[str]]: The map from pure_split IDs to corrupt_split IDs, indicating leakage.

Note

A system c:= {u, v} in corrupt_split is a leaking system for system p:= {s, t} in pure_split iff any of the following are true:

All of the following are true:

u and s are in different clusters

There exists a path between u and s of length <= depth

v and t are in different clusters

There exists a path between v and t of length <= depth

All of the following are true:

u and t are in different clusters

There exists a path between u and t of length <= depth

v and s are in different clusters

There exists a path between v and s of length <= depth

Method sequence:

Create a map from test systems to graph node pairs

Do any filtering required on the graph

Find cluster_leaks for all nodes in these pairs

For train systems, create a nested map from graph node pairs to sets of train systems

For each system x in test:

Map x to a graph test pair

Find all graph train pairs containing at least one member of the graph test pair using the “adjacency map”

Map from these graph train pairs to train systems

These are the inter_split connections

pinder.data.find_transitive_hits.map_systems_to_fsid_pairs(system_ids: set[str]) → dict[str, tuple[str, str]][source][source]#: Get a map from PINDER System ID strings to pairs of Foldseek monomer ID strings

pinder.data.find_transitive_hits.map_fsid_pair_to_systems(system_ids: set[str]) → dict[frozenset[str], set[str]][source][source]#

Construct a hierarchical map from Foldseek monomer ID pairs to PINDER System IDs

This map is insensitive to order. Avoid using defaultdict, which is actually quite slow

pinder.data.find_transitive_hits.deep_merge_dict(a: dict[str, set[str]], b: dict[str, set[str]]) → dict[str, set[str]][source][source]#: Merge two dictionaries in which values are Set objects using set.update

pinder.data.find_transitive_hits.intersection_unordered_product(A: set[str], B: set[str], C: set[frozenset[str]]) → set[frozenset[str]][source][source]#

Compute the intersection between C and the unordered set product of A and B:= U.

Attempts to do this efficiently by determining the size of U before actually computing U, then iterating over the smaller of U or C, while checking inclusion in the larger.

Specifically:

determines the size of U, the unordered cartesian product of A and B
if U is smaller than C, computes U and iterates over it, checking inclusion in C.
if U is larger than C, iterates over C and checks inclusion in A and B.

NB: our elements are of size 1 or 2, so to check inclusion we use 0 or -1 as our indices. If our elements were of another size, this code would not work!

Parameters:

A: set[str]: The first set in the possible product
B: set[str]: The second set in the possible product
C: set[frozenset[str]]: The set that we are intersecting with A x B

Returns:

set[frozenset[str]]: The set intersection between C and AxB (unordered)

pinder.data.find_transitive_hits.unordered_set_product(A: set[str], B: set[str]) → Iterable[tuple[str]][source][source]#

Compute the unordered cartesian product of sets A and B.

We define the unordered set product \(U\) as a subset of \(P := A \times B\), where if \(x := (a, b)\) is in \(U\), then \(y := (b, a)\) is not in \(U\), even if \(y\) is in \(P\).

The goal of this method is to compute the unordered set product of A and B in the most efficient way possible, ideally without computing the entire product.

Given two sets, A and B, with intersection \(I := A \cap B\), where \(|A| = n\), \(|B| = m\), and \(|I| = i\). The cartesian product of A and B has size \(n \times m\), but the unordered cartesian product may be smaller.

To compute the unordered cartesian product, we want the union of:

combinations with replacement, length 2 of \(I\) \(\left( \binom{i+1}{2} = \frac{(i+1) \cdot i}{2} \right)\)

product of \(A \setminus I\) with \(I\) \(\left( (n-i) \cdot i \right)\)

product of \(B \setminus I\) with \(I\) \(\left( (m-i) \cdot i \right)\)

product of \(A \setminus I\) with \(B \setminus I\) \(\left( (n-i) \cdot (m-i) \right)\)

The size of this union is \(n \times m - \frac{i^2 - i}{2}\).

Parameters:

A: set[str]: The first product set.
B: set[str]: The second product set.

Returns:

set[tuple[str]]: The unordered cartesian product of A and B.

pinder.data.find_transitive_hits.len_unordered_set_product(A: set[str], B: set[str]) → int[source][source]#

Compute the size of the unordered cartesian product of sets A and B.

We define the unordered set product U as a subset of P:=product(A, B), where if x:=(a,b) is in U, then y:=(b,a) is not in U, even if y is in P.

Given two sets, A and B, with intersection \(I := A \cap B\), where \(|A| = n\), \(|B| = m\), and \(|I| = i\). The cartesian product of A and B has size \(n \times m\), but the unordered cartesian product may be smaller.

To compute the unordered cartesian product, we want the union of:

combinations with replacement, length 2 of \(I\) \(\left( \binom{i+1}{2} = \frac{(i+1) \cdot i}{2} \right)\)

product of \(A \setminus I\) with \(I\) \(\left( (n-i) \cdot i \right)\)

product of \(B \setminus I\) with \(I\) \(\left( (m-i) \cdot i \right)\)

product of \(A \setminus I\) with \(B \setminus I\) \(\left( (n-i) \cdot (m-i) \right)\)

The size of this union is \(n \times m - \frac{i^2 - i}{2}\). Note that \(i^2 - i\) is always even for every integer \(i\).

Parameters:

A: set[str]: The first product set
B: set[str]: The second product set

Returns:

int: The size of AxB (unordered)

pinder.data.foldseek_utils module#

pinder.data.foldseek_utils.fasta2dict(fasta_file: Path) → dict[str, str][source][source]#

pinder.data.foldseek_utils.create_fasta_from_systems(systems: list[PinderSystem], fasta_file: Path | str) → None[source][source]#

pinder.data.foldseek_utils.extract_fasta_from_pdb(pdb_file: Path) → str[source][source]#

pinder.data.foldseek_utils.parallel_extract_fasta(pdb_files: list[Path], max_workers: int | None = None, parallel: bool = True) → list[str][source][source]#

Extract fasta-formatted sequences from a collection of PDB files in parallel. Operates in parallel and assumes that source files all exist.

Parameters:

pdb_files: list[Path]: List of PDB files to extract fasta strings for.
max_workers: int, optional: Limit number of parallel processes spawned to max_workers.
parallel: bool: Whether to extract fasta in parallel.

pinder.data.foldseek_utils.create_fasta_from_foldseek_inputs(foldseek_dir: Path, fasta_file: Path | str, max_workers: int | None = None, use_cache: bool = True, parallel: bool = True) → None[source][source]#

pinder.data.foldseek_utils.create_foldseek_input_dir(index: Path | str, foldseek_dir: Path | str, pdb_dir: Path | str, use_cache: bool = True, max_workers: int | None = None, parallel: bool = True) → None[source][source]#

pinder.data.foldseek_utils.run_foldseek(input_dir: Path, output_dir: Path, target_db_dir: Path | None = None, config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt')) → None[source][source]#

Run foldseek easy-search on a directory of PDB structures.

Parameters:

input_dir: Path: Input directory for foldseek targets.
output_dir: Path: The output directory to store foldseek alignments.
target_db_dir: Optional[Path]: Optional target DB input directory for foldseek. If not specified, defaults to input_dir.
config: FoldseekConfig: The configuration object containing foldseek parameters.

pinder.data.foldseek_utils.run_mmseqs(input_fasta: Path, output_dir: Path, target_fasta: Path | None = None, use_cache: bool = True, config: MMSeqsConfig = MMSeqsConfig(sensitivity=11.0, evalue=0.05, score_type='pident', min_seq_id=0.2, max_seqs=1000, alignment_filename='alignment.txt')) → None[source][source]#

pinder.data.foldseek_utils.create_dbs(db_root_path: Path | str, chains_path: Path | str, db_size: int = 50000, max_workers: int | None = None, use_cache: bool = True, parallel: bool = True) → None[source][source]#

pinder.data.foldseek_utils.run_db_vs_db(db_path: Path, i: int, j: int, config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt'), use_cache: bool = True) → Path[source][source]#

pinder.data.foldseek_utils.run_foldseek_db_pair(pinder_dir: Path, db_indices: tuple[int, int], foldseek_config: FoldseekConfig, use_cache: bool = True) → None[source][source]#

pinder.data.foldseek_utils.create_dbs_and_run(fold_db_path: Path | str, chains_path: Path | str, db_size: int = 50000, config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt')) → None[source][source]#

pinder.data.foldseek_utils.setup_foldseek_dbs(pinder_dir: Path, foldseek_db_size: int = 50000, use_cache: bool = True) → None[source][source]#

pinder.data.foldseek_utils.collate_foldseek_alignments(pinder_dir: Path, foldseek_db_size: int = 50000, use_cache: bool = True, alignment_filename: str = 'alignment.txt') → None[source][source]#

pinder.data.foldseek_utils.collate_mmseqs_alignments(pinder_dir: Path, foldseek_db_size: int = 50000, use_cache: bool = True, alignment_filename: str = 'alignment.txt') → None[source][source]#

pinder.data.foldseek_utils.aln_to_df(alignment_file: Path, colnames: list[str], score_col: str, default_score_val: float = 0.5) → DataFrame[source][source]#

pinder.data.foldseek_utils.filter_foldseek_edges(aln_df: DataFrame, graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5)) → DataFrame[source][source]#

pinder.data.foldseek_utils.filter_mmseqs_edges(aln_df: DataFrame, graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5)) → DataFrame[source][source]#

pinder.data.foldseek_utils.alignment_to_parquet(alignment_file: Path, alignment_type: str, foldseek_config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt'), mmseqs_config: MMSeqsConfig = MMSeqsConfig(sensitivity=11.0, evalue=0.05, score_type='pident', min_seq_id=0.2, max_seqs=1000, alignment_filename='alignment.txt'), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), use_cache: bool = True, remove_original: bool = True) → None[source][source]#

pinder.data.foldseek_utils.run_foldseek_on_pinder_chains(pdb_dir: Path, index: str = 'index.1.csv.gz', foldseek_dir: Path = PosixPath('/tmp/foldseek'), config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt')) → None[source][source]#

Runs foldseek on the PINDER dataset.

You may need to set your PINDER_DATA_DIR environment variable to the location of the development PINDER dataset.

Parameters:

pdb_dir: Path: Input directory containing pinder PDBs to use for populating foldseek inputs.
index: str: The Pinder index CSV file name.
foldseek_dir: Path: The directory for storing foldseek input PDBs. Defaults to /tmp/foldseek.
config: FoldseekConfig: The configuration object containing foldseek parameters.

pinder.data.foldseek_utils.run_mmseqs_on_pinder_chains(pdb_dir: Path, index: str = 'index.1.csv.gz', output_dir: Path = PosixPath('/tmp/foldseek'), use_cache: bool = True, config: MMSeqsConfig = MMSeqsConfig(sensitivity=11.0, evalue=0.05, score_type='pident', min_seq_id=0.2, max_seqs=1000, alignment_filename='alignment.txt')) → None[source][source]#

Runs mmseqs easy-search on the PINDER dataset.

You may need to set your PINDER_DATA_DIR environment variable to the location of the development PINDER dataset.

Parameters:

pdb_dir: Path: Input directory for foldseek
index: str: The Pinder index CSV file name.
output_dir: Path: The output directory containing foldseek input PDBs. Defaults to /tmp/foldseek.
config: MMSeqsConfig: The configuration object containing mmseqs parameters.

pinder.data.foldseek_utils.setup_mmseqs_dbs(pinder_dir: Path, mmseqs_db_size: int = 50000, use_cache: bool = True) → None[source][source]#

pinder.data.foldseek_utils.run_mmseqs_db_pair(pinder_dir: Path, db_indices: tuple[int, int], mmseqs_config: MMSeqsConfig = MMSeqsConfig(sensitivity=11.0, evalue=0.05, score_type='pident', min_seq_id=0.2, max_seqs=1000, alignment_filename='alignment.txt'), use_cache: bool = True) → None[source][source]#

pinder.data.get_alignment_similarity module#

pinder.data.get_alignment_similarity.add_alignment_cols_to_index(index: DataFrame) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.get_hit_interfaces(pinder_dir: Path, hits: DataFrame, config: ContactConfig = ContactConfig(heavy_only=True, backbone_only=True, backbone_definition='dockq', radius=10.0, only_unique_resi=True, min_length=3)) → dict[tuple[str, str], Interface][source][source]#

pinder.data.get_alignment_similarity.reformat_hits(hits: DataFrame, ref_chains: set[str]) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.get_ref_alignment_hits(index: DataFrame, pinder_dir: Path, alignment_type: str, pinder_subset: str, use_cache: bool = True) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.get_subset_interfaces(index: DataFrame, hits: DataFrame, interfaces: dict[tuple[str, str], Interface], pinder_subset: str) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.get_subset_hits(index: DataFrame, metadata: DataFrame, pinder_dir: Path, alignment_type: str, pinder_subset: str) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.get_paired_hits(index: DataFrame, metadata: DataFrame, pinder_dir: Path, pinder_subset: str) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.find_potential_leaks(index: DataFrame, metadata: DataFrame, pinder_dir: Path = PosixPath('/home/runner/.local/share/pinder/2024-02'), ialign_batch_size: int = 20000, max_workers: int | None = None, use_cache: bool = True, config: IalignConfig = IalignConfig(rmsd_threshold=5.0, log_pvalue_threshold=-9.0, is_score_threshold=0.3, alignment_printout=0, speed_mode=1, min_residues=5, min_interface=5, distance_cutoff=10.0, output_prefix='output')) → DataFrame[source][source]#

pinder.data.get_alignment_similarity.get_alignment_similarity(pinder_dir: Path, cluster_config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), ialign_config: IalignConfig = IalignConfig(rmsd_threshold=5.0, log_pvalue_threshold=-9.0, is_score_threshold=0.3, alignment_printout=0, speed_mode=1, min_residues=5, min_interface=5, distance_cutoff=10.0, output_prefix='output'), use_cache: bool = True) → None[source][source]#

pinder.data.get_annotations module#

pinder.data.get_annotations.annotate_pisalite(path_to_pdb: Path, use_cache: bool = True) → None[source][source]#

Annotate PDB entry with PDBe PISA Lite service.

This function will make two REST API queries to the PDBe PISA Lite service and save the results as JSON files in the directory containing the PDB entry. It will create two files in the directory: - {pdb_id}-pisa-lite-assembly.json - {pdb_id}-pisa-lite-interfaces.json

Parameters:

path_to_pdb: Path: Path to the directory containing the PDB entry. PDB ID is expected to be encoded in the directory name.
use_cache: bool: Whether to skip request if a checkpoint file with name checkpoint-pisa.txt exists in the path_to_pdb directory.

pinder.data.get_annotations.annotate_complex(args: tuple[Path, float], use_cache: bool = True) → None[source][source]#

This function annotates a protein complex dimer.

Parameters:

args: Tuple[Path, float]: The path to the protein complex and the radius for the annotation process.
use_cache: bool: Whether to skip calculations if the annotation output tsv exists.

Returns:

None: This function does not return any value. It annotates the protein complex with: - crystal contacts - number of disconnected components

pinder.data.get_annotations.get_pisa_annotations(mmcif_list: list[Path], parallel: bool = True, max_workers: int | None = None, config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500), use_cache: bool = True) → None[source][source]#

This function fetches PISA annotations for list of PDB entries.

Parameters:

mmcif_listlist[Path]: The list of mmcif entry files to process in a batch.
parallelbool: If True, files will be processed in parallel.
max_workersint | None: If specified, limits number of processes to spawn in parallel mode.
configPinderDataGenConfig: Configuration parameters for dataset generation.

Returns:

None: This function does not return any value. It processes the PDB files in the given directory and saves the results.

pinder.data.get_annotations.get_dimer_annotations(dimer_list: list[Path], parallel: bool = True, max_workers: int | None = None, config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500), use_cache: bool = True) → None[source][source]#

This function annotates a list of dimer PDB files.

Parameters:

dimer_listlist[Path]: The list of dimer PDB files to process in a batch.
max_workersint | None: If specified, limits number of processes to spawn in parallel mode.
configPinderDataGenConfig: Configuration parameters for dataset generation.

Returns:

None: This function does not return any value. It processes the PDB files in the given directory and saves the results.

pinder.data.get_annotations.get_annotations(data_dir: Path, two_char_code: str | None = None, parallel: bool = True, max_workers: int | None = None, config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500)) → None[source][source]#

This function gets annotations for a given PDB directory.

Parameters:

data_dirPath: The path to the directory containing the PDB files.
two_char_codestr, optional: The two character code for the PDB files, by default None.
parallelbool: If True, files will be processed in parallel.
max_workersint | None: If specified, limits number of processes to spawn in parallel mode.
configPinderDataGenConfig: Configuration parameters for dataset generation.

Returns:

None: This function does not return any value. It processes the PDB files in the given directory and saves the results.

pinder.data.get_annotations.pisa_json_to_dataframe(json_file: Path) → DataFrame[source][source]#: Convert the PISA JSON file to a single-row Pandas DataFrame.

pinder.data.get_annotations.collect_metadata(pdb_entries: list[Path], include_pisa: bool = False) → DataFrame[source][source]#

Collect metadata from PDB entries.

Parameters:

pdb_entriesList[Path]: List of paths to PDB entries.
include_pisabool: Whether to include PISA annotations. Default is False.

Returns:

pd.DataFrame: DataFrame containing metadata for each PDB entry.

pinder.data.get_annotations.collect_interacting_chains(pdb_entries: list[Path]) → DataFrame[source][source]#

Collect interacting chains from PDB entries.

Parameters:

pdb_entriesList[Path]: List of paths to PDB entries.

Returns:

pd.DataFrame: DataFrame containing interacting chains for each PDB entry.

pinder.data.get_annotations.collect_annotations(pdb_entries: list[Path]) → DataFrame[source][source]#

Collect annotations from PDB entries.

Parameters:

pdb_entriesList[Path]: List of paths to PDB entries.

Returns:

pd.DataFrame: DataFrame containing annotations for each PDB entry.

pinder.data.get_annotations.collect(data_dir: Path, pinder_dir: Path) → None[source][source]#

Collect annotations, metadata and generate index.

This function is responsible for collecting annotations, metadata and generating an index.

Parameters:

data_dirPath: The directory where the data is stored.
pinder_dirPath: The directory where the Pinder data is stored.

Returns:

None

pinder.data.get_apo module#

pinder.data.get_apo.get_valid_apo_monomer_ids(pinder_dir: Path, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), max_workers: int | None = None, use_cache: bool = True, remove_chain_copies: bool = True, parallel: bool = True) → None[source][source]#

Validates and stores a list of valid apo monomer IDs based on specific criteria defined in the configuration.

This function processes monomer IDs to determine which qualify as valid apo monomers based on atom types and residue counts. Results are saved to a Parquet file, and processing is skipped if the file already exists and caching is enabled.

Parameters:

pinder_dir (Path) – The directory that contains monomer data and where the output will be stored.
config (ApoPairingConfig, optional) – Configuration containing the validation thresholds.
max_workers (int | None, optional) – The maximum number of worker processes for parallel computation.
use_cache (bool, optional) – If True, skips processing if the output file already exists.
remove_chain_copies (bool, optional) – If True, removes duplicate chain entries before processing.

pinder.data.get_apo.get_putative_pairings(pinder_dir: Path, use_cache: bool = True, remove_chain_copies: bool = False) → DataFrame[source][source]#

Generates a DataFrame of putative apo-holo pairings from validated apo monomer IDs.

This function loads validated apo monomer IDs and pairs them with corresponding holo structures. The pairing is done based solely on Uniprot ID of the holo and apo monomer, respectively. Results are stored in a Parquet file and cached if enabled.

Parameters:

pinder_dir (Path) – Directory containing the validated apo monomer IDs and holo structures.
use_cache (bool, optional) – If True, returns cached pairings from a Parquet file if available.
remove_chain_copies (bool, optional) – If True, removes duplicate chain entries before pairing.

Returns:

A DataFrame containing putative apo-holo pairings.

Return type:

pd.DataFrame

pinder.data.get_apo.get_apo_pairing_metrics(pinder_dir: Path, putative_pairs: list[str] | DataFrame, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), max_workers: int | None = None, output_parquet: Path | None = None, use_cache: bool = True, parallel: bool = True) → DataFrame[source][source]#

Retrieves or calculates apo-holo pairing metrics from specified pair identifiers or DataFrame.

This function processes pairings to calculate various metrics that help assess the suitability of apo-holo pairings. If caching is enabled and a valid cache file exists, the function returns the data from the cache.

Parameters:

pinder_dir (Path) – Base directory containing the data.
putative_pairs (list[str] | pd.DataFrame) – Either a list of pairing identifiers or a DataFrame with pairings.
config (ApoPairingConfig) – Configuration settings for the pairing analysis.
max_workers (int | None) – Maximum number of worker processes for parallel computation.
output_parquet (Path | None) – Path to a Parquet file where results are stored or retrieved.
use_cache (bool) – Whether to use cached results if available.

Returns:

A DataFrame containing metrics for each pair.

Return type:

pd.DataFrame

pinder.data.get_apo.select_potential_apo(pinder_dir: Path, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), parallel: bool = True, max_workers: int | None = None) → DataFrame[source][source]#

Selects potential apo structures based on various metrics from a metrics DataFrame stored in a Parquet file. Additional metrics are calculated and hard filters are applied based on config.

Parameters:

pinder_dir (Path) – The directory where the data is stored.
config (ApoPairingConfig) – Configuration used to determine selection criteria.

Returns:

A DataFrame of selected potential apo structures based on defined criteria.

Return type:

pd.DataFrame

pinder.data.get_apo.get_apo_monomer_weighted_score(apo_data: DataFrame, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), scale_type: str = 'standard') → DataFrame[source][source]#

Uses a suite of apo-holo difficulty assessment metrics to compute a weighted score that is used to rank and select a single receptor and single ligand monomer for a given pinder dimer entry when apo structures are available.

Parameters:

apo_data (pd.DataFrame) – Data containing metrics for each apo monomer.
config (ApoPairingConfig) – Configuration containing the metrics and their weights.
scale_type (str) – Type of scaling to apply, ‘standard’ for Z-score or ‘minmax’ for Min-Max scaling.

Returns:

The input DataFrame with an additional column ‘apo_score’ containing the computed scores.

Return type:

pd.DataFrame

pinder.data.get_apo.add_weighted_apo_score(potential_apo: DataFrame, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))) → DataFrame[source][source]#

Adds a weighted score to each potential apo structure to facilitate the selection of the most suitable structure.

Parameters:

potential_apo (pd.DataFrame) – DataFrame containing the potential apo structures.
config (ApoPairingConfig) – Configuration settings for the scoring system.

Returns:

The DataFrame with an additional column representing the weighted score of each entry.

Return type:

pd.DataFrame

pinder.data.get_apo.run_monomer_dimer_mmseqs(pinder_dir: Path, potential_apo: DataFrame, use_cache: bool = True) → None[source][source]#

Executes MMseqs2 to compare sequence similarities between monomers and dimers and caches the results. This method acts as a second layer of validation on the original pairing algorithm. The alignment file can be used to calculate an alternative metric akin to calculate_frac_monomer_dimer_overlap. The usage of the mmseqs outputs is currently experimental and not used in the pairing or final selection.

Parameters:

pinder_dir (Path) – Directory where data is stored and from which MMseqs2 is run.
potential_apo (pd.DataFrame) – DataFrame containing potential apo structures to compare.
use_cache (bool) – If True, uses cached results if available.

Returns:

Results are saved to files and not directly returned.

Return type:

None

pinder.data.get_apo.add_all_apo_pairings_to_index(pinder_dir: Path, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

Adds all validated apo pairings to the pinder index.

The index is updated with boolean columns for apo_R/L to indicate whether an apo monomer exists for the pinder dimer entry. A canonical apo_R_pdb and apo_L_pdb is selected to use downstream when evaluating methods on the test set. Alternative apo structures are stored in a semi-colon separated string in the apo_R_pdbs and apo_L_pdbs column for optional usage during e.g. training.

Parameters:

pinder_dir (Path) – Directory containing all necessary datasets and configuration files.
config (ApoPairingConfig) – Configuration settings that dictate the process.
use_cache (bool) – If True, will not reprocess if the result is already calculated and stored.

Returns:

The results are saved in the index and no value is returned.

Return type:

None

pinder.data.get_apo.collate_apo_metrics(metric_dir: Path, output_parquet: Path) → None[source][source]#

Collates individual metric files into a single Parquet file for easier management and access.

Parameters:

metric_dir (Path) – Directory containing the individual metric Parquet files.
output_parquet (Path) – Path to the output Parquet file where the collated metrics will be stored.

Returns:

The results are written directly to a Parquet file.

Return type:

None

pinder.data.get_clusters module#

pinder.data.get_clusters.get_dimer_to_cluster_pair(interfaces: dict[tuple[str, str], Interface], node_to_cluster: dict[str, int]) → dict[tuple[str, str], tuple[int, int]][source][source]#

Get a mapping from interface dimers to cluster pairs.

The value (-1, -1) corresponds to the null cluster.

Parameters:

interfaces: Dict[Tuple[str,str], Interface]: Mapping (dict) from dimers, represented as a tuple of monomer PINDER IDs to Interface objects representing the dimer interface.
node_to_cluster: Dict[str, int]: Mapping (dict) from monomers, represented as a single foldseek monomer ID to monomer cluster IDs.

Returns:

dict[tuple[str, str], tuple[int, int]]: Mapping (dict) from dimers, represented as a tuple of monomer PINDER IDs to monomer cluster ID pairs.

pinder.data.get_clusters.add_clusters_to_index(index: DataFrame, dimer_to_cluster_pair: dict[tuple[str, str], tuple[int, int]], name: str) → None[source][source]#

Add cluster pairs created from graph clustering to an existing index

IMPORTANT: We define a cluster pair as a tuple of integers: (x, y) | x <= y

In order to preserve the ability to back out which monomer goes with which cluster, we also store R- and L-specific cluster IDs

Parameters:

index: pd.DataFrame: The existing index, containing the “id” field, at least
dimer_to_cluster_pair: dict[tuple[str, str], tuple[int, int]]: A mapping from dimers: pair of monomers in pinder naming format to cluster id pairs.

Returns:

None. Mutates index

pinder.data.get_clusters.choose_final_clusters(index: DataFrame, cluster_prefix: str) → None[source][source]#

Choose cluster assignments to use as the “final” cluster assignments

Copies one set of cluster ID columns with the prefix: <cluster_prefix> to a new set of columns with the prefix: “cluster”.

E.g., if <cluster_prefix> is “foldseek_community”, copies the columns

[“foldseek_community_id_R”, “foldseek_community_id_L”, “foldseek_community_id”]

to the columns

[“cluster_id_R”, “cluster_id_L”, “cluster_id”]

pinder.data.get_clusters.load_cluster_cache(cache_pkl: Path) → list[set[str]][source][source]#

pinder.data.get_clusters.save_cluster_cache(cluster_data: list[set[str]], cache_pkl: Path) → None[source][source]#

pinder.data.get_clusters.cluster(index: DataFrame, foldseek_graph: Graph, mmseqs_graph: Graph | None, interfaces_clean: dict[tuple[str, str], Interface], output_index_filename: str = 'index.2.csv.gz', checkpoint_dir: Path = PosixPath('/tmp/clust_chkpt'), config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), foldseek_components: list[set[str]] | None = None, foldseek_communities: list[set[str]] | None = None, mmseqs_components: list[set[str]] | None = None, use_cache: bool = True) → DataFrame[source][source]#

Cluster interfaces based on FOLDSEEK or MMSEQS alignments-based graph

Parameters:

index: pd.DataFrame

The input PINDER index file

foldseek_graph: nx.Graph

The foldseek-similarity-based interface graph. This graph should:

contain monomer nodes
contain weighted edges that indicate interface similarity
have been pre-filtered or “cleaned” as desired.

mmseqs_graph: nx.Graph | None

Optional: The mmseqs-similarity-based interface graph. This graph should:

contain monomer nodes
contain weighted edges that indicate interface similarity
have been pre-filtered or “cleaned” as desired.

interfaces_clean: dict[tuple[str, str], Interface]

Dictionary mapping dimers to min. length-filtered interfaces.

output_index_filename: str

Name of the updated index file. For example: “index.2.csv.gz”

checkpoint_dir: Path

Directory in which to save checkpoints.

seed: int | np.random.RandomState

The random seed to use for AsynLPA clustering.

edge_weight: str | None

The edge attribute for nx.Graph inputs representing the weight of an edge. If None, uses 1 for all weights. Used for AsynLPA clustering. Defaults to “weight”.

foldseek_components: dict[tuple[int, int], set[tuple[tuple[str, str], bool]]]

Mapping from component id to dimer index + flag indicating whether it has been sorted

foldseek_communities: dict[tuple[int, int], set[tuple[tuple[str, str], bool]]]

Mapping from community id to dimer index + flag indicating whether it has been sorted

mmseqs_components: dict[tuple[int, int], set[tuple[tuple[str, str], bool]]]

Mapping from component id to dimer index + flag indicating whether it has been sorted

canonical_method: str

name of the “primary” clustering method

Returns:

pd.DataFrame: The input index, with additional fields indicating component and community IDs
Also writes this DataFrame to file

pinder.data.get_data module#

pinder.data.get_data.ingest_rscb_files(data_dir: Path = PosixPath('.'), two_char_code: str | None = None, parallel: bool = True, max_workers: int | None = None, config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500)) → None[source][source]#

Process the downloaded RCSB files by globs on the data directory.

Parameters:

data_dirPath: The directory where the downloaded files are stored.
two_char_codeOptional[str]: A two character code representing the batch of files to process. If not provided, all files will be processed.
parallelbool: If True, files will be processed in parallel.
max_workersint | None: If specified, limits number of processes to spawn in parallel mode.
configPinderDataGenConfig: Configuration parameters for dataset generation.

Returns:

None

pinder.data.get_data.ingest_mmcif_list(mmcif_list: list[Path], parallel: bool = True, max_workers: int | None = None, config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500), use_cache: bool = True) → None[source][source]#

Process a list of downloaded RCSB mmcif files.

Parameters:

mmcif_listlist[Path]: The list of mmcif files to process in a batch.
parallelbool: If True, files will be processed in parallel.
max_workersint | None: If specified, limits number of processes to spawn in parallel mode.
configPinderDataGenConfig: Configuration parameters for dataset generation.

Returns:

None

pinder.data.get_data.generate_bio_assembly(mmcif_filename: Path) → tuple[Any, DataFrame][source][source]#: Generate biological assemblies for the given mmCIF file

pinder.data.get_data.read_mmcif_file(mmcif_filename: Path) → CIFFile[source][source]#: Read a PDBx/mmCIF file.

pinder.data.get_data.convert_category(category: dict[str, ndarray[Any, Any]]) → dict[int, dict[str, Any]][source][source]#: Convert a PDBx/mmCIF category to a dictionary indexed by sequential ids, with keys and values taken from the original value arrays.

pinder.data.get_data.replace_with_nan(value: Any) → Any[source][source]#

pinder.data.get_data.get_mmcif_category(pdbx_file: CIFFile, category_name: str) → dict[int, dict[str, Any]][source][source]#: Get a PDBx/mmCIF category as a dictionary

pinder.data.get_data.infer_uniprot_from_mapping(mapping_df: DataFrame) → str[source][source]#: Assign uniprot based on largest number of residues in mapping (in case of chimera)

pinder.data.get_data.sequence_mapping(pdbx_file: CIFFile, entry_id: str, entity_id: str) → DataFrame[source][source]#: Get sequence mapping from a PDBx/mmCIF file for all chains.

pinder.data.get_data.get_entities(pdbx_file: CIFFile, entry_id: str) → DataFrame | None[source][source]#: Get entities from a PDBx/mmCIF file.

pinder.data.get_data.get_metadata(pdbx_file: CIFFile) → DataFrame | None[source][source]#

Get metadata from a PDBx/mmCIF file.

Beware of special cases, e.g. who would have thought there are entries with multiple methods? https://www.rcsb.org/structure/7a0l

pinder.data.get_data.get_structure_chains(structure: AtomArrayStack | AtomArray) → list[str][source][source]#: Get all chains in a structure, ordered by decreasing size (in residues). If the size is the same, order by chain ID alphabetically (1 11 12 2 3 4 …). The logic is that we want to assign the largest chains as Receptors and smaller chains as Ligands, such that in an R::L dimer, R is the largest chain and L is the smallest chain.

pinder.data.get_data.get_interacting_chains(structure: AtomArray, entities: DataFrame, contact_threshold: float = 10.0, backbone_only: bool = True) → DataFrame[source][source]#

Identify interacting chains in a structure.

backbone_only: The method focuses on protein backbone atoms as defined by DockQ atom names (C, CA, N, O). Due to the focus on backbone atoms, only residue-level contact information is returned.

pinder.data.get_data.save(meta: DataFrame, metadata_file: Path) → None[source][source]#

pinder.data.get_data.save_mapping_checkpoint(checkpoint_file: Path) → None[source][source]#

pinder.data.get_data.process_mmcif(mmcif_file: Path, config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500), use_cache: bool = True) → None[source][source]#

Process a single mmCIF file from the next generation PDB archive.

Parameters:

mmcif_file – Path The mmCIF file to be processed.
config – PinderDataGenConfig Configuration parameters for dataset generation.
use_cache –
bool Whether to skip processing if the metadata file exists and status is set to one of the PROCESSED_STATUS_CODES:
- complete
- no metadata
- assembly failed
- entities failed
- non-protein assembly
- too many chains

Returns:

None

Note

Saves metadata as a text file.
Saves biological assembly as an mmCIF file.
Saves interacting chains along with the interface metrics as a text file.
Saves all pairs of interacting chains as PDB files.
Saves residue numbers of interacting chains as a text file along with Uniprot IDs and numbering.

pinder.data.get_dimers module#

pinder.data.get_dimers.primary_dimer_index_from_dimers(dimers: list[Dimer]) → DataFrame[source][source]#

pinder.data.get_dimers.merge_index_and_entities(index_df: DataFrame, pinder_dir: Path) → DataFrame[source][source]#

pinder.data.get_dimers.validate_schemas(df_index: DataFrame, df_metadata: DataFrame) → None[source][source]#

pinder.data.get_dimers.summarize_putative_apo_pred_counts(pinder_dir: Path) → None[source][source]#

pinder.data.get_dimers.merge_metadata(dimers: list[Dimer], pinder_dir: Path) → None[source][source]#

Merge metadata and annotations from dimers into a single csv file

Parameters:

dimerslist[Dimer]: List of Dimer objects to be processed.
pinder_dirPath: Path to the directory where the output csv file will be saved.

Returns:

None

pinder.data.get_dimers.cast_resi_to_valid_str(resi: str | int | float) → str[source][source]#

pinder.data.get_dimers.load_mapping_chains(pqt_file: Path) → dict[str, str | int | None][source][source]#

pinder.data.get_dimers.collate_chain_info(pinder_dir: Path, max_workers: int | None = None, parallel: bool = True) → None[source][source]#

pinder.data.get_dimers.collate_entity_pqts(entry_dirs: list[Path], pinder_dir: Path, use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

pinder.data.get_dimers.populate_predicted(monomer_ids: list[str], pinder_path: Path, alphafold_path: str = 'gs://public-datasets-deepmind-alphafold-v4', google_cloud_project: str = '', use_cache: bool = True) → list[Monomer][source][source]#

Populate AlphaFold2 (aka “predicted”) monomer structures

Parameters:

monomer_idslist[str]: The list of monomer IDs for which to populate the predicted structure.
pinder_pathPath: The path to the Pinder dataset.
alphafold_pathstr, optional: The path to the AlphaFold dataset, by default “gs://public-datasets-deepmind-alphafold-v4”.
google_cloud_projectstr, optional: The Google Cloud project to use, by default “”.
use_cachebool: Whether to skip populating predicted PDBs if they already exist at the destination paths.

Returns:

list[Monomer]: The list of predicted Monomer instances which were successfully downloaded.

pinder.data.get_dimers.get_pdb_entry_dirs(data_dir: Path) → list[Path][source][source]#

pinder.data.get_dimers.get_monomers_from_mapping_pqts(mapping_pqts: list[Path], pinder_dir: Path) → list[Monomer][source][source]#

pinder.data.get_dimers.get_dimers_from_dimer_pdbs(dimer_pdbs: list[Path], pinder_dir: Path, validate_files: bool = True) → tuple[list[Dimer], list[Monomer]][source][source]#

pinder.data.get_dimers.get_af_monomers_from_monomer_ids(monomer_ids: list[str], pinder_dir: Path) → list[Monomer][source][source]#

pinder.data.get_dimers.populate_entries(data_dir: Path, pinder_dir: Path, alphafold_path: str = 'gs://public-datasets-deepmind-alphafold-v4', google_cloud_project: str = '', entry_dirs: list[Path] | None = None, use_cache: bool = True, use_af_cache: bool = True, populate_alphafold: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

Index PINDER dimers

Parameters:

data_dirPath: The directory where the data is stored.
pinder_dirPath: The directory where the PINDER data will be stored.
alphafold_pathstr, optional: The path to the AlphaFold models. Defaults to “gs://public-datasets-deepmind-alphafold-v4”.
google_cloud_projectstr, optional: The name of the Google Cloud project that you have access to. Defaults to “”.
entry_dirslist[Path], optional: Optional subset of PDB entry directories to populate. Will populate all if not provided.
use_cachebool: Whether to skip populating entries if they are already populated.
use_af_cachebool: Whether to skip populating AF2 entries if they are already populated.
populate_alphafoldbool: Whether to populate AF2 entries after RCSB-derived PDBs.
parallelbool: Whether to populate entries in parallel. Note: this part requires more memory than other steps.
max_workersint, optional: Limit number of parallel processes spawned to max_workers.

pinder.data.get_dimers.populate_predicted_from_monomers(data_dir: Path, pinder_dir: Path, alphafold_path: str = 'gs://public-datasets-deepmind-alphafold-v4', google_cloud_project: str = '', entry_dirs: list[Path] | None = None, use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

Populate predicted monomers after monomers and dimers have been populated.

Parameters:

data_dirPath: The directory where the data is stored.
pinder_dirPath: The directory where the PINDER data will be stored.
alphafold_pathstr, optional: The path to the AlphaFold models. Defaults to “gs://public-datasets-deepmind-alphafold-v4”.
google_cloud_projectstr, optional: The name of the Google Cloud project that you have access to. Defaults to “”.
entry_dirslist[Path], optional: Optional subset of PDB entry directories to populate. Will populate all if not provided.
use_cachebool: Whether to skip populating entries if they are already populated.

pinder.data.get_dimers.get_dimers_from_interface_annotations(data_dir: Path, pinder_dir: Path) → DataFrame[source][source]#

pinder.data.get_dimers.get_matching_entry_files(entry_dirs: list[Path], glob_pattern: str, max_workers: int | None = None, parallel: bool = True) → list[Path][source][source]#

Find all files matching a glob pattern in parallel across a list of ingested PDB entry directories.

Parameters:

entry_dirslist[Path]: PDB entry directories to search.
glob_patternstr: The glob expression to use for matching files.
max_workersint, optional: Limit number of parallel processes spawned to max_workers.
parallelbool: Whether to search in parallel.

pinder.data.get_dimers.split_monomer_dimer_mapping_pqts(mapping_files: list[Path]) → tuple[list[Path], list[Path]][source][source]#

Split list of mapping parquet files into true monomers and split dimer monomers.

Parameters:

mapping_fileslist[Path]: List of mapping files with parquet extension.

pinder.data.get_dimers.split_monomer_dimer_pdbs(pdb_files: list[Path]) → tuple[list[Path], list[Path]][source][source]#

Split list of PDB files into true monomer PDBs and dimer + split-dimer PDBs.

Parameters:

pdb_fileslist[Path]: List of pdb files to split into monomers and dimers.

pinder.data.get_dimers.get_monomer_index_from_files(monomer_mappings: list[Path], monomer_pdbs: list[Path]) → DataFrame[source][source]#

Get index of monomers with valid parquet mapping and PDB file pair on disk. The monomer mapping and PDB files do not need to be in a paired order.

Parameters:

monomer_mappingslist[Path]: List of mapping files corresponding to true monomers.
monomer_pdbslist[Path]: List of PDB files corresponding to true monomers.

pinder.data.get_dimers.get_dimer_index_from_files(data_dir: Path, pinder_dir: Path, dimer_mappings: list[Path], dimer_pdbs: list[Path]) → DataFrame[source][source]#

Get index of dimer files with valid parquet mappings and PDB file pairs on disk.

Parameters:

data_dirPath: The directory where the data is stored.
pinder_dirPath: The directory where the PINDER data will be stored.
dimer_mappingslist[Path]: List of mapping files corresponding to split-dimer monomers.
dimer_pdbslist[Path]: List of PDB files corresponding to dimers and split-dimer monomers.

pinder.data.get_dimers.get_populated_entries(data_dir: Path, pinder_dir: Path, alphafold_path: str = 'gs://public-datasets-deepmind-alphafold-v4', google_cloud_project: str = '', entry_dirs: list[Path] | None = None, transient_interface_config: TransientInterfaceConfig = TransientInterfaceConfig(radius=2.3, min_buried_sasa=1000.0, disulfide_bond_distance=2.05, disulfide_bond_distance_tol=0.05, disulfide_bond_dihedral=90.0, disulfide_bond_dihedral_tol=10.0), use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

Index PINDER dimers

Parameters:

data_dirPath: The directory where the data is stored.
pinder_dirPath: The directory where the PINDER data will be stored.
alphafold_pathstr, optional: The path to the AlphaFold models. Defaults to “gs://public-datasets-deepmind-alphafold-v4”.
google_cloud_projectstr, optional: The name of the Google Cloud project that you have access to. Defaults to “”.
entry_dirslist[Path], optional: Optional subset of PDB entry directories to populate. Will populate all if not provided.
transient_interface_configTransientInterfaceConfig: Config object containing parameters used to label potentially transient interfaces.
use_cachebool: Whether to skip populating entries if they are already populated.
parallelbool: Whether to populate entries in parallel. Note: this part requires more memory than other steps.
max_workersint, optional: Limit number of parallel processes spawned to max_workers.

pinder.data.get_dimers.index_dimers(data_dir: Path | str, pinder_dir: Path | str, alphafold_path: str = 'gs://public-datasets-deepmind-alphafold-v4', google_cloud_project: str = '', entry_dirs: list[Path] | None = None, use_cache: bool = True, parallel: bool = True, max_workers: int | None = None, transient_interface_config: TransientInterfaceConfig = TransientInterfaceConfig(radius=2.3, min_buried_sasa=1000.0, disulfide_bond_distance=2.05, disulfide_bond_distance_tol=0.05, disulfide_bond_dihedral=90.0, disulfide_bond_dihedral_tol=10.0)) → None[source][source]#

pinder.data.get_dimers.find_intersection(row: Series) → int[source][source]#

Find number of pinder dimer interface residues that intersect with an ECOD domain.

Applied to each row of the per-chain mappings, where each row contains the residues in the interface, the residues in the structure in our numbering, PDB numbering, and the ECOD domain begin and end residue IDs in PDB numbering.

Each row contains residues in condensed form, with comma separators. It is required to be in the same ordering for each column, such that a mapping can be constructed by splitting on commas. E.g., row.resi_pdb = ‘-1,0,1,2,3’ and row.resi = ‘1,2,3,4,5’.

pinder.data.get_dimers.get_per_chain_ecod_summary(ecod_RL: DataFrame) → DataFrame[source][source]#

Find ECOD annotations corresponding to pinder dimer chains.

Adds comma-separated ECOD domain IDs, names and number of interface residues that intersect with the domain annotation for each matched pinder dimer chain.

pinder.data.get_dimers.add_ecod_to_metadata(pinder_dir: Path | str, use_cache: bool = True) → None[source][source]#

Add ECOD domain overlap with pinder dimers into metadata.

Reads stage 1 metadata.1.csv.gz and writes a new metadata.2.csv.gz. If the output metadata file exists and use_cache is True, the step is skipped.

pinder.data.get_dimers.add_enzyme_classification(pinder_dir: Path | str, use_cache: bool = True) → None[source][source]#

Add enzyme classification numbers and set contains_enzyme based on RCSB EC annotations.

Reads stage 1 index.1.csv.gz, writes a new enzyme_classification_metadata.parquet, and sets the boolean contains_enzyme column in the index based on whether either asym_id in the pinder dimer has an EC number. If the output metadata parquet file exists and use_cache is True, the step is skipped.

pinder.data.get_dimers.add_predicted_monomers_to_index(pinder_dir: Path, use_cache: bool = True) → None[source][source]#

pinder.data.get_dimers.get_dimer_interchain_bond_atom_info(pdb_file: Path, interface_res: dict[str, list[int]], config: TransientInterfaceConfig = TransientInterfaceConfig(radius=2.3, min_buried_sasa=1000.0, disulfide_bond_distance=2.05, disulfide_bond_distance_tol=0.05, disulfide_bond_dihedral=90.0, disulfide_bond_dihedral_tol=10.0)) → dict[str, str | int][source][source]#

pinder.data.get_dimers.label_potential_transient_interfaces(pinder_dir: Path, config: TransientInterfaceConfig = TransientInterfaceConfig(radius=2.3, min_buried_sasa=1000.0, disulfide_bond_distance=2.05, disulfide_bond_distance_tol=0.05, disulfide_bond_dihedral=90.0, disulfide_bond_dihedral_tol=10.0), use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

pinder.data.get_splits module#

pinder.data.get_splits.print_test_meta_details(test_meta: DataFrame) → None[source][source]#: This function will print out the details of the test set that was generated by the get_split_subsets function.

pinder.data.get_splits.get_splits(pinder_dir: Path, config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), use_cache: bool = True) → None[source][source]#

pinder.data.get_splits.rename_peptide_cluster_ids(pindex: DataFrame, config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000)) → DataFrame[source][source]#

pinder.data.get_splits.get_split_subsets(index_path: str | Path, metadata_path: str | Path, test_systems_path: str | Path, availability_index_path: str | Path = 'data/index_with_apo.parquet', test_meta_output_path: str | Path = 'data/test_subset.csv', filtered_pindex_output_path: str | Path = 'data/pindex_checkpoint.3.csv', af2_transitive_hits_path: str | Path = 'data/af2_lddt070_transitive_hits_mapping.csv', config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000)) → DataFrame[source][source]#

Get the split subsets. This function will generate the test set and the filtered pindex based on the test set. The test set will be saved to test_meta_output_path and the filtered pindex will be saved to filtered_pindex_output_path.

Parameters:

test_systems_path (Path) – The path to the test systems.
index_path (Path) – The path to the index.
metadata_path (Path) – The path to the metadata.
availability_index_path (Path) – The path to the availability index.
test_meta_output_path (Path) – The path to save the test meta data.
filtered_pindex_output_path (Path) – The path to save the filtered pindex.
max_depth_2_hits (int) – The maximum depth 2 hits.
max_depth_2_hits_with_comm (int) – The maximum depth 2 hits with comm.
min_depth_2_hits_with_comm (int) – The minimum depth 2 hits with comm.
top_n (int) – The top n.

pinder.data.get_splits.get_test_val_splits(original_index_path: str | Path = 'index_with_apo.parquet', filtered_index_path: str | Path = 'pindex_checkpoint.3.csv', metadata_path: str | Path = 'metadata.2.csv.gz', test_meta_path: str | Path = 'test_subset.csv', deleak_map_path: str | Path = 'transitive_hits_mapping.csv', deleak_mask_outpath: str | Path = 'pindex_checkpoint.4.csv', config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000)) → None[source][source]#

pinder.data.get_splits.get_train_noisy_apo(pinder_dir: Path, index: DataFrame, config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat'))) → DataFrame[source][source]#

pinder.data.get_splits.add_neff_to_index(index: DataFrame) → DataFrame[source][source]#

pinder.data.get_splits.construct_final_index(pinder_dir: Path, apo_config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), use_cache: bool = True, blacklist_invalid_ids: dict[str, str] = {'1hmc__A1_P09603--1hmc__B1_P09603': 'calpha-only dimer', '2xuw__A1_Q72HW2--2xuw__A2_Q72HW2': 'non-biological assembly with borderline prodigy-cryst probability'}) → None[source][source]#

pinder.data.get_test_set module#

pinder.data.get_test_set.create_transformed_holo_monomer(pdb_paths: tuple[Path, Path]) → None[source][source]#

pinder.data.get_test_set.create_normalized_test_monomers(pinder_dir: Path, use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

pinder.data.get_test_set.get_stratified_sample(df: DataFrame, feat: str | list[str], n_samples: int = 20, n_bins: int = 5, random_state: int = 251) → DataFrame[source][source]#

Gets a stratified sample

Outputs a stratified sample of the provided dataframe based on criteria set for binning distribution. Max binning dimension supported is 2D.

Parameters:

dfpd.DataFrame: The DataFrame to sample from.
featUnion[str, List]: The feature to segment bins from (1-D or 2-D)
n_samplesint: The number of samples to sample from df with the same distribution probability
n_binsint: Number of bin segments per feature. Note: the total number of bins is squared for 2D, e.g. 25 for 5 bins in 2D.
random_stateint, optional: random seed number

Returns:

pd.DataFrame: DataFrame containing sampled data from the original DataFrame

pinder.data.get_test_set.assign_pinder_s_subset(pinder_dir: Path, index: DataFrame, max_size: int = 250, min_frac_heterodimers: float = 0.75, heterodimer_seq_identity_threshold: float = 0.8) → DataFrame[source][source]#

pinder.data.get_test_set.assign_test_subsets(pinder_dir: Path, use_cache: bool = True) → None[source][source]#

pinder.data.get_test_set.curate_test_split(pinder_dir: Path, use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

pinder.data.get_test_set.extract_sequence(pdb_file: Path) → dict[str, str][source][source]#

pinder.data.get_test_set.construct_sequence_database(pinder_dir: Path, use_cache: bool = True, parallel: bool = True, max_workers: int | None = None) → None[source][source]#

pinder.data.graph_utils module#

pinder.data.graph_utils.get_alignment_graph_with_indices(alignment_pqt: Path) → DiGraph[source][source]#

Convert a foldseek alignment parquet file to a float-weighted monomer similarity graph.

This graph contains an edge between two given monomers (X, Y) if all of the following conditions are met:

X != Y (string equality on “{pdb_id}_{chain}”)

Similarity score is greater than <score_thr>

Similarity score is less than <upper_threshold>

Alignment length is greater than <min_length>

The conditions constitute requirements for a “valid” alignment.

This graph contains exactly one node for a monomer in <alignment_pqt> if and only if the monomer has at least one valid alignment. This means that the output graph does not necessarily contain all monomers.

The edge score between X and Y is defined as the score of one valid alignment between X and Y in the alignment file.

Parameters:

alignment_pqtPath: Path to a pre-filtered alignment parquet file converted from original alignment format to parquet via foldseek_utils.alignment_to_parquet. Expects specific formatting.

Returns:

nx.DiGraph: Graph containing nodes (monomers) and float-weighted edges.

Notes

Possible issue arises from the fact that scores are not symmetric, but only one score is kept. Which score this is depends on alignment file order.

pinder.data.graph_utils.sample_pairs_from_clusters(clusters: dict[str, list[str]], length_file: str | None = None) → tuple[set[str], set[str]][source][source]#: Sample pairs from clusters

pinder.data.graph_utils.system_to_pdb_chain_id(system_id: str) → str[source][source]#

Removes the uniprot component of a pinder monomer id

Parameters:

system_id: str: The PINDER System monomer ID string

Returns:

str: The pdb_chain monomer ID, includes trailing chain digits

pinder.data.graph_utils.system_id_to_fsid_pair(system_id: str) → tuple[str, str][source][source]#

Transform a PINDER System ID to a pair of Foldseek chain IDs

Parameters:

system_id: str: The PINDER System ID string

Returns:

tuple[str, str]: The pair of Foldseek chain IDs

pinder.data.graph_utils.system_monomer_to_fsid(system_monomer: str) → str[source][source]#

Transform a PINDER System monomer ID to a Foldseek chain

Parameters:

system_monomer: str: The PINDER System monomer ID string

Returns:

str: The Foldseek chain ID

pinder.data.graph_utils.interface_monomer_to_system_monomer(interface_monomer: str) → str[source][source]#

Transform an Interface monomer ID to a PINDER System monomer ID

This essentially amounts to removing “-R” or “-L” from the end

Parameters:

interface_monomer: str: The Interface monomer ID string

Returns:

str: The PINDER System monomer ID string

pinder.data.graph_utils.get_interface_graph(alignment_graph: DiGraph, interfaces: dict[tuple[str, str], list[Interface]], coverage: float = 0.75) → Graph[source][source]#

Create an interface_graph using an alignment graph and interface map, then remove nodes from the interface graph if those nodes are not in a specified nodeset.

Parameters:

alignment_graph: nx.DiGraph: The alignment graph to use as a supergraph for interface graph construction. Nodes are monomer ID strings (foldseek format), edge (A, B) in G if A, B are foldseek-similar.
interfaces: dict[tuple[str, str], alignment_utils.Interface]: Mapping from PINDER dimers (pairs of PINDER monomer ID strings) to Interface objects
coverage: float: Proportion of interface that must be covered by foldseek alignment. Comes from GraphConfig.coverage_threshold.

Returns:

interface_graph: nx.Graph

Subgraph of alignment_graph where: edges exist between A and B if A and B are similar at an interface, and nodes are removed if not present in <used_interfaces>.

pinder.data.graph_utils.cluster_from_graph(graph: Graph, community: bool = True, weight: str | None = 'weight', seed: int | RandomState | None = None) → list[set[str]][source][source]#

Computes clusters from a given graph using either asynchronous label propagation or connected_components

Note that asynchronous label propagation is a stochastic algorithm and is therefore not guaranteed to return the same results if seed is not specified.

Parameters:

graph: nx.Graph: An arbitrary networkX graph with edge weights <weights>. Nodes of this graph are expected to be monomer ids of type: str
community: bool: If True, use asynchronous label propagation. Else, return connected components
weight: str | None: The edge attribute for nx.Graph inputs representing the weight of an edge. If None, uses 1 for all weights. Used for AsynLPA clustering. Defaults to “weight”.
seed: int | np.random.RandomState | None: The random seed for asynchronous label propagation.

Returns:

clusters: List[Set[str]]: A list of sets of nodes (str) corresponding to the output clusters

pinder.data.graph_utils.get_node_to_cluster_mapping(clusters: list[set[str]]) → dict[str, int][source][source]#

Create a dictionary mapping node IDs to cluster IDs given a list of clusters.

Parameters:

clusters: list[set[str]]: List of clusters, where each cluster is a set of node IDs.

Returns:

dict[str, int]: Dictionary mapping node IDs to cluster IDs.

pinder.data.graph_utils.clean_interface_graph(interface_graph: Graph, used_interfaces: set[str]) → Graph[source][source]#

Remove nodes from the interface graph if those nodes are not in a specified nodeset.

Parameters:

interface_graph: nx.Graph: The interface graph constructed from an alignment graph.
used_interfaces: set[str]: Set of unique monomer ID strings (foldseek format) in interfaces.keys()

Returns:

interface_graph: nx.Graph

Subgraph of interface_graph where: edges exist between A and B if A and B are similar at an interface, and nodes are removed if not present in <used_interfaces>.

pinder.data.graph_utils.load_graph_pickle(pkl_file: Path) → Graph[source][source]#

pinder.data.graph_utils.construct_interface_graph(interface_pkl: Path = PosixPath('interfaces.pkl'), output_dir: Path = PosixPath('/tmp/graphs'), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5)) → None[source][source]#

Filter existing Interface objects on minimum interface length in residues.

Parameters:

foldseek_output: Path: Path to foldseek output folder from the previous step
interface_pkl: Path: Path to the pickle file containing previously extracted interfaces.
output_dir: Path: Path to directory in which to store generated graph pickle files.
graph_config: GraphConfig: Config object storing parameters used for constructing graphs.

pinder.data.graph_utils.construct_graph_from_alignment(alignment_file: Path = PosixPath('/tmp/foldseek/foldseek_dbs/alignment.txt'), alignment_type: str = 'foldseek', output_dir: Path = PosixPath('/tmp/graphs'), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5)) → Graph[source][source]#

pinder.data.graph_utils.construct_interface_cleaned_graph(graph_pkl: Path | Graph, interface_pkl: Path, alignment_type: str, graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), use_cache: bool = True) → None[source][source]#

pinder.data.graph_utils.construct_interface_alignment_graph(interface_pkl: Path, alignment_file: Path = PosixPath('/tmp/foldseek/foldseek_dbs/alignment.txt'), alignment_type: str = 'foldseek', output_dir: Path = PosixPath('/tmp/graphs'), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), use_cache: bool = True) → None[source][source]#

class pinder.data.graph_utils.InterfaceGraph(incoming_graph_data=None, multigraph_input=None, **attr)[source][source]#

Bases: MultiDiGraph

to_undirected_class() → Callable[[], MaxGraph][source]#

Returns the class to use for empty undirected copies.

If you subclass the base classes, use this to designate what undirected class to use for to_undirected() copies.

class pinder.data.graph_utils.MaxGraph(incoming_graph_data: Any | None = None, **attr: Any)[source][source]#

Bases: Graph

default_val = -inf#

agg_field = 'weight'#

add_edges_from(ebunch_to_add: list[tuple[str, str, dict[str, Any]]], **attr: Any) → None[source]#: Add edges, but only if the agg_field is better than previous

add_edge(u_of_edge: str, v_of_edge: str, **attr: Any) → None[source]#: Add an edge between u and v, IFF the agg_field is better than existing.

pinder.data.rcsb_rsync module#

Utilities for interacting with the RCSB nextgen rsync server.

pinder.data.rcsb_rsync.download_rscb_files(data_dir: Path = PosixPath('.'), two_char_code: str | None = None, redirect_stdout: bool = True, retries: int = 5) → None[source][source]#

This function downloads RCSB files using rsync.

Parameters:

two_char_codeOptional[str]: A two character code representing the batch of files to download. If not provided, all files will be downloaded.
data_dirstr: The directory where the downloaded files will be stored.
redirect_stdoutbool: Whether to silence stdout by redirecting to /dev/null. Default is True.

Examples

>>> download_rscb_files('./data', '1a') 
This will download the batch of files represented by the code '1a', such as 31ab, 51ac, etc.

>>> download_rscb_files(data_dir='./data') 
This will download all files.

pinder.data.rcsb_rsync.download_two_char_codes(codes: list[str], data_dir: Path = PosixPath('.'), redirect_stdout: bool = True) → None[source][source]#

This function downloads RCSB files corresponding to a list of two-character codes using rsync. Each two-character code corresponds to the second and third characters of a PDB ID (e.g. the code bo for PDB ID 6boo).

Parameters:

codeslist[str]: A list of two-character codes representing the batches of files to download.
data_dirstr: The directory where the downloaded files will be stored.
redirect_stdoutbool: Whether to silence stdout by redirecting to /dev/null. Default is True.

Examples

>>> download_two_char_codes(['1a', '1b'], './data') 
This will download the batches of files represented by the codes '1a' and '1b', such as 31ab, 51ac, etc.

pinder.data.rcsb_rsync.get_rsync_directories() → list[str][source][source]#

pinder.data.rcsb_rsync.get_rsync_two_char_pdb_entries(two_char_code: str, retries: int = 3) → list[str][source][source]#

pinder.data.rcsb_rsync.get_all_rsync_entries(two_char_codes: list[str]) → list[str][source][source]#

pinder.data.rcsb_rsync.get_two_char_codes_not_downloaded(data_dir: Path, two_char_codes: list[str]) → list[str][source][source]#

pinder.data.run module#

Examples for running via CLI:

Full pipeline

pinder_data run

Specify a custom pinder root directory/mount point

pinder_data --pinder_mount_point ./pinder-data-pipeline run

Run specific stage (download RCSB files associated with two-character code bo)

pinder_data --two_char_code bo run_stage download_rcsb_files
# OR
pinder_data --t bo run_stage download_rcsb_files

Example PDB ingest data directory structure:

./pinder-data-pipeline/data/bo/pdb_00006boo/
├── 6boo-assembly.cif
├── 6boo-entities.parquet
├── 6boo-interacting_chains.tsv
├── 6boo-metadata.tsv
├── 6boo-pisa-lite-assembly.json
├── 6boo-pisa-lite-interfaces.json
├── 6boo__A1_B0YD89--6boo__C1_B0YD89.pdb
├── 6boo__A1_B0YD89--6boo__C1_B0YD89.tsv
├── 6boo__A1_B0YD89-R.parquet
├── 6boo__A1_B0YD89-R.pdb
├── 6boo__C1_B0YD89-L.parquet
├── 6boo__C1_B0YD89-L.pdb
├── checkpoint-mapping.txt
├── checkpoint-pisa.txt
├── foldseek_contacts
│   └── 2f2691f67bd6fde5ba4ad0152799dc95
│       └── 6boo__A1_B0YD89--6boo__C1_B0YD89.json
└── pdb_00006boo_xyz-enrich.cif.gz

and the resulting PINDER dataset directory structure:

./pinder-data-pipeline/2024-02/
├── apo_metrics
│   ├── pair_eval
│   │   └── metrics_0.parquet
│   ├── scored_noisy_train_apo_pairings.parquet
│   └── two_sided_apo_monomer_metrics.parquet
├── chain_metadata.parquet
├── cluster
│   └── f6e35584321f647887eacb8ee369305f
│       ├── af2_lddt070_test_sys_table.csv
│       ├── af2_lddt070_transitive_hits_mapping.csv
│       ├── af2_lldt070_test_sys_table.csv
│       ├── af2_lldt070_transitive_hits_mapping.csv
│       ├── foldseek_af2_lddt070_leakage_dict.pkl
│       ├── foldseek_af2_lldt070_leakage_dict.pkl
│       ├── foldseek_af2_lldt070_potential_leaks.pkl
│       ├── foldseek_communities.pkl
│       ├── foldseek_components.pkl
│       ├── foldseek_leakage_dict.pkl
│       ├── foldseek_potential_leaks.pkl
│       ├── index.2.csv.gz
│       ├── mmseqs_components.pkl
│       ├── mmseqs_leakage_dict.pkl
│       ├── mmseqs_potential_leaks.pkl
│       ├── pindex_checkpoint.3.csv
│       ├── pindex_checkpoint.4.csv
│       ├── test_subset.csv
│       ├── test_sys_table.csv
│       └── transitive_hits_mapping.csv
├── dimer_ids.parquet
├── ecod_metadata.parquet
├── entity_metadata.parquet
├── enzyme_classification_metadata.parquet
├── external_annotations
│   └── sabdab_summary_all.tsv
├── foldseek  [672 entries exceeds filelimit, not opening dir]
├── graphs
│   └── 52d26a07886d2d2300c364a381680e8b
│       ├── cleaned_foldseek_alignment_graph.pkl
│       ├── cleaned_mmseqs_alignment_graph.pkl
│       ├── foldseek_alignment_graph.pkl
│       ├── min_length_interfaces.pkl
│       └── mmseqs_alignment_graph.pkl
├── ialign_metrics
│   ├── ialign_potential_leaks.parquet
│   ├── ialign_split_similarity_labels.parquet
│   ├── metrics.parquet
│   ├── pindex_checkpoint.5.parquet
│   └── potential_alignment_leaks.parquet
├── index.1.csv.gz
├── index.parquet
├── index_with_apo.parquet
├── index_with_pred.parquet
├── interface_annotations.parquet
├── interfaces.parquet
├── mappings  [1363 entries exceeds filelimit, not opening dir]
├── metadata.1.csv.gz
├── metadata.2.csv.gz
├── metadata.parquet
├── mmseqs2
│   ├── input.fasta
│   └── mmseqs_dbs
│       ├── 00000
│       │   ├── 00000
│       │   │   ├── alignment.txt
│       │   │   └── mmseqs_error.txt
│       │   └── db
│       │       └── input_00000.fasta
│       ├── alignment.parquet
│       ├── alignment.txt
│       └── filtered_alignment.parquet
├── monomer_ids.parquet
├── monomer_predicted_ids.parquet
├── pdbs  [2890 entries exceeds filelimit, not opening dir]
├── putative_apo_monomer_ids.parquet
├── putative_two_sided_apo_pairings.parquet
├── rcsb_annotations
│   ├── annotations  [268 entries exceeds filelimit, not opening dir]
│   ├── annotations_cath.csv.gz
│   ├── annotations_ecod.csv.gz
│   ├── annotations_other.csv.gz
│   ├── annotations_scop.csv.gz
│   ├── enzyme_classification  [268 entries exceeds filelimit, not opening dir]
│   ├── enzyme_classification.csv.gz
│   ├── features  [268 entries exceeds filelimit, not opening dir]
│   ├── features_asa.csv.gz
│   ├── features_binding_site.csv.gz
│   ├── features_cath.csv.gz
│   ├── features_ecod.csv.gz
│   ├── features_occupancy.csv.gz
│   ├── features_other.csv.gz
│   ├── features_outlier.csv.gz
│   ├── features_sabdab.csv.gz
│   ├── features_scop.csv.gz
│   ├── features_unobserved.csv.gz
│   ├── pfam  [268 entries exceeds filelimit, not opening dir]
│   ├── pfam.csv.gz
│   └── query_data  [268 entries exceeds filelimit, not opening dir]
├── sabdab_metadata.parquet
├── scored_apo_pairings.parquet
├── structural_metadata.parquet
├── supplementary_metadata.parquet
└── test_set_pdbs
    ├── 4boq__A1_Q5VVQ6-R.pdb
    ├── 4boq__A2_Q5VVQ6-L.pdb
    ├── 4boz__A1_Q5VVQ6-R.pdb
    ├── 4boz__B1_P0CG48-L.pdb
    ├── 5bot__A1_P45452-R.pdb
    ├── 5bot__B1_P45452-L.pdb
    ├── 8bo1__A1_P68135-L.pdb
    ├── 8bo1__B1_A0A9P1NJI6-R.pdb
    ├── 8bo8__A1_Q04609-R.pdb
    ├── 8bo8__A2_Q04609-L.pdb
    ├── 8bos__A1_P01112-L.pdb
    ├── 8bos__B1_P20936-R.pdb
    ├── 8bou__A1_A0A7G5MNS2-R.pdb
    └── 8bou__B1_A0A7G5MNS2-L.pdb

pinder.data.run.method_main() → None[source][source]#

pinder.data.run.main() → None[source][source]#

pinder.data.system module#

pinder.data.system.get_dev_systems(dev_index: Path, dev_metadata: Path, dataset_path: Path | None = None) → Iterator[PinderSystem][source][source]#

Loads a list of PinderSystem objects from a local (development) Pinder dataset.

Examples

from pinder.core.loader import filters
from pinder.data.system import get_dev_systems

base_filters = [
    filters.FilterByMissingHolo(),
    filters.FilterSubByContacts(min_contacts=5, radius=10.0, calpha_only=True),
    filters.FilterByHoloElongation(max_var_contribution=0.92),
    filters.FilterDetachedHolo(radius=12, max_components=2),
]
sub_filters = [
    filters.FilterSubByAtomTypes(min_atom_types=4),
    filters.FilterByHoloOverlap(min_overlap=5),
    filters.FilterByHoloSeqIdentity(min_sequence_identity=0.8),
    filters.FilterSubLengths(min_length=0, max_length=1000),
    filters.FilterSubRmsds(rmsd_cutoff=7.5),
    filters.FilterByElongation(max_var_contribution=0.92),
    filters.FilterDetachedSub(radius=12, max_components=2),
]
dimers = get_dev_systems(path_to_dev_pinder)

for sub_filter in sub_filters:
    dimers = (dimer for dimer in dimers if sub_filter(dimer))

for base_filter in base_filters:
    dimers = (dimer for dimer in dimers if base_filter(dimer))

Module contents#

Namespace package root for pinder-data.