pinder.data.pipeline package#
Submodules#
pinder.data.pipeline.cache module#
- pinder.data.pipeline.cache.skip_step(step_name: str, run_specific_step: str = '', skip_specific_step: str = '') bool [source][source]#
- pinder.data.pipeline.cache.get_uningested_mmcif(ingest_cifs: list[Path]) list[Path] [source][source]#
- pinder.data.pipeline.cache.get_pisa_unannotated(ingest_cifs: list[Path], use_checkpoint: bool = True) list[Path] [source][source]#
- pinder.data.pipeline.cache.complete_rcsb_annotation(pdb_id: str, annotation_fp: Path) bool [source][source]#
- pinder.data.pipeline.cache.get_rcsb_unannotated(pdb_ids: list[str], pinder_dir: Path) list[str] [source][source]#
pinder.data.pipeline.constants module#
pinder.data.pipeline.data_pipeline module#
- pinder.data.pipeline.data_pipeline.save_stage_metadata(pinder_mount: Path, step_name: str, version_metadata: dict[str, str]) None [source][source]#
- pinder.data.pipeline.data_pipeline.task_step(step_name: str, save_metadata: bool = False) Callable[[Callable[[...], T]], Callable[[...], T | None]] [source][source]#
- pinder.data.pipeline.data_pipeline.scatter_step(step_name: str) Callable[[Callable[[...], T]], Callable[[...], T | None]] [source][source]#
- class pinder.data.pipeline.data_pipeline.DataIngestPipeline(image: str = 'local', pinder_mount_point: str = '/home/runner/.local/share/pinder', pinder_release: str = '2024-02', ingest_config: PinderDataGenConfig = PinderDataGenConfig(interacting_chains_backbone_only=True, interacting_chains_radius=10.0, connected_component_radius=15.0, max_assembly_chains=500), contact_config: ContactConfig = ContactConfig(heavy_only=True, backbone_only=True, backbone_definition='dockq', radius=10.0, only_unique_resi=True, min_length=3), transient_interface_config: TransientInterfaceConfig = TransientInterfaceConfig(radius=2.3, min_buried_sasa=1000.0, disulfide_bond_distance=2.05, disulfide_bond_distance_tol=0.05, disulfide_bond_dihedral=90.0, disulfide_bond_dihedral_tol=10.0), foldseek_config: FoldseekConfig = FoldseekConfig(sensitivity=11.0, evalue=0.05, score_type='lddt', max_seqs=1000, alignment_type=2, alignment_filename='alignment.txt'), mmseqs_config: MMSeqsConfig = MMSeqsConfig(sensitivity=11.0, evalue=0.05, score_type='pident', min_seq_id=0.2, max_seqs=1000, alignment_filename='alignment.txt'), scatter_config: ScatterConfig = ScatterConfig(two_char_batch_size=2, mmcif_batch_size=250, graphql_batch_size=50000, dimer_batch_size=5000, predicted_batch_size=20000, foldseek_db_size=50000, apo_pairing_id_batch_size=20000), graph_config: GraphConfig = GraphConfig(min_interface_length=7, min_alignment_length=10, score_threshold=0.5, upper_threshold=1.1, mmseqs_score_threshold=30.0, mmseqs_upper_threshold=110.0, coverage_threshold=0.5), cluster_config: ClusterConfig = ClusterConfig(seed=40, canonical_method='foldseek_community', edge_weight='weight', foldseek_cluster_edge_threshold=0.7, foldseek_edge_threshold=0.55, foldseek_af2_difficulty_threshold=0.7, mmseqs_edge_threshold=0.0, resolution_thr=3.5, min_chain_length=40, min_atom_types=3, max_var_thr=0.98, oligomeric_count=2, method='X-RAY DIFFRACTION', interface_atom_gaps_4A=0, prodigy_label='BIO', number_of_components=1, 
alphafold_cutoff_date='2021-10-01', depth_limit=2, max_node_degree=1000, top_n=1, min_depth_2_hits_with_comm=1, max_depth_2_hits_with_comm=2000, max_depth_2_hits=1000), apo_config: ApoPairingConfig = ApoPairingConfig(apo_chain='A', contact_rad=10.0, backbone_only=False, heavy_only=False, min_atom_types=3, min_residues=5, min_holo_resolved_frac=0.3, align_method='pymol', max_refine_rmsd=10.0, min_aligned_apo_res_frac=0.7, min_seq_identity=0.3, max_interface_miss_frac=0.3, max_frac_monomer_dimer_sequence=0.75, invalid_coverage_upper_bound=2.0, invalid_coverage_lower_bound=0.5, scaled_score_metrics=('I-RMSD', 'refine_rmsd', 'sequence_identity', 'Fnat', 'Fnonnat')), ialign_config: IalignConfig = IalignConfig(rmsd_threshold=5.0, log_pvalue_threshold=-9.0, is_score_threshold=0.3, alignment_printout=0, speed_mode=1, min_residues=5, min_interface=5, distance_cutoff=10.0, output_prefix='output'), two_char_code: str | None = None, run_specific_step: str = '', skip_specific_step: str = '', use_cache: bool = True, google_cloud_project: str = 'vantai-analysis')[source][source]#
Bases:
object
pinder.data.pipeline.scatter module#
- pinder.data.pipeline.scatter.chunk_collection(array: list[Any], batch_size: int | None = None, num_batches: int | None = None) Generator[list[Any], list[Any], None] [source][source]#
- Simple chunking algorithm with two possible constraints:
  - the maximum number of items within a chunk
  - the maximum total number of chunks
Note
Does not guarantee even packing across individual chunks.
- Parameters:
array – List, the iterable to be chunked
batch_size – Optional[int], default=None; the maximum number of items in a given chunk
num_batches – Optional[int], default=None; the maximum number of chunks to be produced
- Returns:
chunks
- Return type:
Generator[List[Any], List[Any], None]
- pinder.data.pipeline.scatter.chunk_dict(data: dict[str, str], batch_size: int) Generator[dict[str, str], dict[str, str], None] [source][source]#
- pinder.data.pipeline.scatter.chunk_all_vs_all_indices(array: list[Any], batch_size: int) Generator[tuple[int, int], tuple[int, int], None] [source][source]#
- pinder.data.pipeline.scatter.chunk_dict_with_indices(data: dict[str, str], batch_size: int) Generator[tuple[int, dict[str, str]], tuple[int, dict[str, str]], None] [source][source]#
pinder.data.pipeline.tasks module#
- pinder.data.pipeline.tasks.get_stage_inputs(data_dir: Path, input_type: str) list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame] [source][source]#
- pinder.data.pipeline.tasks.get_cache_delta(cache_func: Callable[[...], list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame]], batches: list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame], **kwargs: str | Path | bool) list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame] [source][source]#
- pinder.data.pipeline.tasks.get_stage_tasks(data_dir: ~pathlib.Path, input_type: str, batch_size: int, cache_func: ~typing.Callable[[...], list[str] | list[~pathlib.Path]] | None = None, cache_kwargs: dict[str, str | ~pathlib.Path | bool] = {}, scatter_method: ~typing.Callable[[...], ~typing.Generator[~typing.Any, ~typing.Any, None]] = <function chunk_collection>, scatter_kwargs: dict[str, str | ~pathlib.Path | bool] = {}) list[list[str] | list[Path] | list[tuple[str, bool]] | list[tuple[Path, Path] | DataFrame]] [source][source]#