1313
1414from galaxy .exceptions import (
1515 AuthenticationRequired ,
16+ MessageException ,
1617 ObjectNotFound ,
1718)
1819from galaxy .files .models import (
@@ -105,6 +106,7 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
105106 - doi:10.70122/FK2/AVNCLL (persistent ID)
106107 - doi:10.70122/FK2/DIG2DG/AVNCLL (persistent ID)
107108 - doi:10.70122/FK2/DIG2DG/id:12345 (database ID)
109+ - doi:10.5072/FK2/doi:10.70122/AVNCLL (persistent ID)
108110 - perma:BSC/3ST00L/id:9056 (database ID)
109111 """
110112 if not source_path .startswith ("/" ):
@@ -125,20 +127,54 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
125127 f"Invalid source path: '{ source_path } '. Expected format: '/<dataset_id>/<file_identifier>'."
126128 )
127129
128- file_id_part = parts [- 1 ]
129- dataset_id = "/" .join (parts [:- 1 ])
130+ dataset_id , file_id_part = self ._split_dataset_and_file_pid (parts )
130131
131132 # The file identifier can be either:
132133 # - A persistent ID suffix (e.g., 'AVNCLL' -> full ID is 'doi:10.70122/FK2/DIG2DG/AVNCLL')
133134 # - A database ID with 'id:' prefix (e.g., 'id:12345' -> file_identifier is 'id:12345')
134135 if file_id_part .startswith ("id:" ):
135136 # Database ID format - keep the 'id:' prefix as the file identifier
136137 file_id = file_id_part
138+ elif re .match (r"^[a-zA-Z][a-zA-Z0-9+.-]*:.*" , file_id_part ):
139+ # Full persistent identifier (e.g. doi:, hdl:, ark:, or custom PID providers).
140+ # Files in Dataverse may have their own independent persistent IDs that are
141+ # not hierarchically related to the dataset persistent ID.
142+ file_id = file_id_part
137143 else :
138- # Persistent ID format - construct full persistent ID
144+ # Dataset-scoped persistent ID suffix - construct full persistent ID
139145 file_id = f"{ dataset_id } /{ file_id_part } "
140146 return ContainerAndFileIdentifier (container_id = dataset_id , file_identifier = file_id )
141147
148+ @staticmethod
149+ def _split_dataset_and_file_pid (parts : list [str ]) -> tuple [str , str ]:
150+ """
151+ Split a Dataverse source path into dataset ID and file identifier parts.
152+
153+ Dataverse file-level persistent IDs may themselves contain slashes and are not
154+ necessarily hierarchically related to the dataset persistent ID. For example:
155+
156+ /doi:10.57745/I8EUTL/doi:10.57745/L7SOAJ
157+
158+ In this case:
159+ dataset_id = doi:10.57745/I8EUTL
160+ file_id = doi:10.57745/L7SOAJ
161+
162+ This helper detects such cases by recognizing URI-scheme prefixes in path segments
163+ and grouping them accordingly.
164+ """
165+ # Default: last segment is the file identifier
166+ file_id_part = parts [- 1 ]
167+ dataset_id = "/" .join (parts [:- 1 ])
168+
169+ # Heuristic: if the penultimate segment starts a URI scheme (e.g. doi:, hdl:, ark:),
170+ # then the file persistent ID spans the last two segments.
171+ pid_scheme_re = re .compile (r"^[a-zA-Z][a-zA-Z0-9+.-]*:" )
172+ if len (parts ) >= 3 and pid_scheme_re .match (parts [- 2 ]):
173+ file_id_part = f"{ parts [- 2 ]} /{ parts [- 1 ]} "
174+ dataset_id = "/" .join (parts [:- 2 ])
175+
176+ return dataset_id , file_id_part
177+
142178 def get_container_id_from_path (self , source_path : str ) -> str :
143179 return self .parse_path (source_path , container_id_only = True ).container_id
144180
@@ -336,14 +372,14 @@ def create_draft_file_container(
336372 collection_payload = self ._prepare_collection_data (title , public_name , user_email )
337373 collection = self ._create_collection (":root" , collection_payload , context )
338374 if not collection or "data" not in collection or "alias" not in collection ["data" ]:
339- raise Exception ("Could not create collection in Dataverse or response has an unexpected format." )
375+ raise MessageException ("Could not create collection in Dataverse or response has an unexpected format." )
340376 collection_alias = collection ["data" ]["alias" ]
341377
342378 # Prepare and create the dataset
343379 dataset_payload = self ._prepare_dataset_data (title , public_name , user_email )
344380 dataset = self ._create_dataset (collection_alias , dataset_payload , context )
345381 if not dataset or "data" not in dataset :
346- raise Exception ("Could not create dataset in Dataverse or response has an unexpected format." )
382+ raise MessageException ("Could not create dataset in Dataverse or response has an unexpected format." )
347383
348384 dataset ["data" ]["name" ] = title
349385 return dataset ["data" ]
@@ -421,14 +457,18 @@ def _download_file(
421457 f"Authentication required to download file from '{ download_file_content_url } '. "
422458 f"Please provide a valid API token in your user preferences."
423459 )
424- # TODO: We can only download files from published datasets for now
425- if e .code in [403 , 404 ]:
460+ if e .code == 403 :
461+ # Permission denied: dataset may be unpublished or user lacks access rights
462+ raise ObjectNotFound (
463+ f"Access forbidden when downloading file from '{ download_file_content_url } '. "
464+ f"You may not have permission to access this file, or the dataset is not published."
465+ )
466+ if e .code == 404 :
426467 raise ObjectNotFound (
427468 f"File not found at '{ download_file_content_url } '. "
428469 f"Please make sure the dataset and file exist and are published."
429470 )
430- else :
431- raise
471+ raise
432472
433473 def _get_datasets_from_response (self , response : dict ) -> list [RemoteDirectory ]:
434474 rval : list [RemoteDirectory ] = []
@@ -494,7 +534,7 @@ def _ensure_response_has_expected_status_code(self, response, expected_status_co
494534 error_message = self ._get_response_error_message (response )
495535 if response .status_code == 403 :
496536 self ._raise_auth_required (error_message )
497- raise Exception (
537+ raise MessageException (
498538 f"Request to { response .url } failed with status code { response .status_code } : { error_message } "
499539 )
500540
0 commit comments