pinder.data.pipeline package#

Submodules#

pinder.data.pipeline.cache module#

pinder.data.pipeline.cache.skip_step(step_name: str, run_specific_step: str = '', skip_specific_step: str = '') → bool[source][source]#

pinder.data.pipeline.cache.get_uningested_mmcif(ingest_cifs: list[Path]) → list[Path][source][source]#

pinder.data.pipeline.cache.get_pisa_unannotated(ingest_cifs: list[Path], use_checkpoint: bool = True) → list[Path][source][source]#

pinder.data.pipeline.cache.complete_rcsb_annotation(pdb_id: str, annotation_fp: Path) → bool[source][source]#

pinder.data.pipeline.cache.get_rcsb_unannotated(pdb_ids: list[str], pinder_dir: Path) → list[str][source][source]#

pinder.data.pipeline.cache.get_unannotated_dimer_pdbs(dimer_pdbs: list[Path], use_cache: bool = True) → list[Path][source][source]#

pinder.data.pipeline.cache.get_dimer_pdbs_missing_foldseek_contacts(dimer_pdbs: list[Path], config_hash: str, use_cache: bool = True) → list[Path][source][source]#

pinder.data.pipeline.constants module#

pinder.data.pipeline.data_pipeline module#

pinder.data.pipeline.data_pipeline.save_stage_metadata(pinder_mount: Path, step_name: str, version_metadata: dict[str, str]) → None[source][source]#

pinder.data.pipeline.data_pipeline.year_month() → str[source][source]#

pinder.data.pipeline.data_pipeline.task_step(step_name: str, save_metadata: bool = False) → Callable[[Callable[[...], T]], Callable[[...], T | None]][source][source]#

pinder.data.pipeline.data_pipeline.scatter_step(step_name: str) → Callable[[Callable[[...], T]], Callable[[...], T | None]][source][source]#

class pinder.data.pipeline.data_pipeline.DataIngestPipeline(image: str = 'local', pinder_mount_point: str = '/home/runner/.local/share/pinder', pinder_release: str = '2024-02', ingest_config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500), contact_config: ContactConfig = ContactConfig(heavy_only=True, backbone_only=True, backbone_definition='dockq', radius=10.0, only_unique_resi=True, min_length=3), transient_interface_config: TransientInterfaceConfig = TransientInterfaceConfig(radius=2.3, min_buried_sasa=1000.0, disulfide_bond_distance=2.05, disulfide_bond_distance_tol=0.05, disulfide_bond_dihedral=90.0, disulfide_bond_dihedral_tol=10.0), foldseek_config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt'), mmseqs_config: MMSeqsConfig = MMSeqsConfig(sensitivity=11.0, evalue=0.05, score_type='pident', min_seq_id=0.2, max_seqs=1000, alignment_filename='alignment.txt'), scatter_config: ScatterConfig = ScatterConfig(two_char_batch_size=2, mmcif_batch_size=250, graphql_batch_size=50000, dimer_batch_size=5000, predicted_batch_size=20000, foldseek_db_size=50000, apo_pairing_id_batch_size=20000), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), cluster_config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, 
alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), apo_config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), ialign_config: IalignConfig = IalignConfig(rmsd_threshold=5.0, log_pvalue_threshold=-9.0, is_score_threshold=0.3, alignment_printout=0, speed_mode=1, min_residues=5, min_interface=5, distance_cutoff=10.0, output_prefix='output'), two_char_code: str | None = None, run_specific_step: str = '', skip_specific_step: str = '', use_cache: bool = True, google_cloud_project: str = 'vantai-analysis')[source][source]#

Bases: object

run() → None[source]#

run_stage(stage_name: str, specific_method: str | None = None) → None[source]#

generate_download_rcsb_files_tasks() → None[source]#

download_rcsb_files(codes: list[str]) → None[source]#

generate_cif_ingest_tasks() → None[source]#

ingest_rcsb_files(mmcif_files: list[Path]) → None[source]#

generate_pisa_annotation_tasks() → None[source]#

get_pisa_annotations(mmcif_files: list[Path]) → None[source]#

generate_rcsb_annotation_tasks() → None[source]#

get_rcsb_annotations(pdb_ids: list[str]) → None[source]#

join_rcsb_annotations() → None[source]#

generate_dimer_annotation_tasks() → None[source]#

get_dimer_annotations(dimer_pdbs: list[Path]) → None[source]#

collect_dimer_annotations() → None[source]#

generate_foldseek_contacts_tasks() → None[source]#

get_dimer_contacts(dimer_pdbs: list[Path]) → None[source]#

collect_foldseek_contacts() → None[source]#

generate_populate_entry_tasks() → None[source]#

populate_entries(entry_dirs: list[Path]) → None[source]#

generate_populate_predicted_tasks() → None[source]#

populate_predicted(entry_dirs: list[Path]) → None[source]#

index_dimers() → None[source]#

add_predicted_monomers() → None[source]#

get_valid_apo_monomers() → None[source]#

generate_apo_pairing_metric_tasks() → None[source]#

get_apo_pairing_metrics(pairing_batch: tuple[list[str], Path]) → None[source]#

join_apo_pairing_metrics() → None[source]#

add_apo_pairings_to_index() → None[source]#

create_foldseek_dbs() → None[source]#

generate_foldseek_tasks() → None[source]#

run_foldseek(db_indices: tuple[int, int]) → None[source]#

join_foldseek() → None[source]#

create_mmseqs_dbs() → None[source]#

generate_mmseqs_tasks() → None[source]#

run_mmseqs(db_indices: tuple[int, int]) → None[source]#

join_mmseqs() → None[source]#

construct_interface_graph() → None[source]#

construct_foldseek_graph() → None[source]#

construct_mmseqs_graph() → None[source]#

cluster() → None[source]#

generate_find_leakage_tasks() → None[source]#

find_leakage(graph_batch: list[tuple[str, bool]]) → None[source]#

get_transitive_hits() → None[source]#

get_af2_hard_difficulty_transitive_hits() → None[source]#

get_splits() → None[source]#

get_alignment_similarity() → None[source]#

construct_final_index() → None[source]#

get_test_set() → None[source]#

pinder.data.pipeline.scatter module#

pinder.data.pipeline.scatter.chunk_collection(array: list[Any], batch_size: int | None = None, num_batches: int | None = None) → Generator[list[Any], list[Any], None][source][source]#

Simple chunking algorithm with two possible constraints:

- the maximum number of items within a chunk (batch_size)
- the maximum total number of chunks (num_batches)

Note

Does not guarantee even packing across individual chunks.

Parameters:

array (List) – the iterable to be chunked
batch_size (Optional[int], default=None) – the maximum number of items in a given chunk
num_batches (Optional[int], default=None) – the maximum number of chunks to be produced

Returns:

chunks

Return type:

Generator[List[Any], List[Any], None]

pinder.data.pipeline.scatter.chunk_dict(data: dict[str, str], batch_size: int) → Generator[dict[str, str], dict[str, str], None][source][source]#

pinder.data.pipeline.scatter.chunk_all_vs_all_indices(array: list[Any], batch_size: int) → Generator[tuple[int, int], tuple[int, int], None][source][source]#

pinder.data.pipeline.scatter.chunk_dict_with_indices(data: dict[str, str], batch_size: int) → Generator[tuple[int, dict[str, str]], tuple[int, dict[str, str]], None][source][source]#

pinder.data.pipeline.scatter.chunk_dataframe(data: DataFrame, batch_size: int) → Generator[tuple[int, DataFrame], tuple[int, DataFrame], None][source][source]#

pinder.data.pipeline.scatter.chunk_apo_pairing_ids(data: DataFrame, batch_size: int, pinder_dir: Path) → Generator[tuple[list[str], Path], tuple[list[str], Path], None][source][source]#

pinder.data.pipeline.tasks module#

pinder.data.pipeline.tasks.cif_glob(data_dir: Path) → list[Path][source][source]#

pinder.data.pipeline.tasks.dimer_glob(data_dir: Path) → list[Path][source][source]#

pinder.data.pipeline.tasks.entry_glob(data_dir: Path) → list[Path][source][source]#

pinder.data.pipeline.tasks.two_char_code_rsync(data_dir: Path) → list[str][source][source]#

pinder.data.pipeline.tasks.pdb_id_glob(data_dir: Path) → list[str][source][source]#

pinder.data.pipeline.tasks.foldseek_pdb_glob(data_dir: Path) → list[Path][source][source]#

pinder.data.pipeline.tasks.graph_type_glob(data_dir: Path) → list[tuple[str, bool]][source][source]#

pinder.data.pipeline.tasks.putative_apo_pairings(data_dir: Path) → DataFrame[source][source]#

pinder.data.pipeline.tasks.get_stage_inputs(data_dir: Path, input_type: str) → list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame][source][source]#

pinder.data.pipeline.tasks.get_cache_delta(cache_func: Callable[[...], list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame]], batches: list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame], **kwargs: str | Path | bool) → list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame][source][source]#

pinder.data.pipeline.tasks.get_stage_tasks(data_dir: ~pathlib.Path, input_type: str, batch_size: int, cache_func: ~typing.Callable[[...], list[str] | list[~pathlib.Path]] | None = None, cache_kwargs: dict[str, str | ~pathlib.Path | bool] = {}, scatter_method: ~typing.Callable[[...], ~typing.Generator[~typing.Any, ~typing.Any, None]] = <function chunk_collection>, scatter_kwargs: dict[str, str | ~pathlib.Path | bool] = {}) → list[list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame]][source][source]#

pinder.data.pipeline.tasks.run_task(task_func: Callable[[Any], Any], task_input: dict[str, Path | list[Path] | list[str] | tuple[Path, Path] | tuple[int, int] | tuple[int, dict[str, str]]], iterable_kwarg: str | None = None) → None[source][source]#