Skip to content

Commit 1e61e73

Browse files
authored
USE-91-handle-empty-crawls (#51)
Why these changes are being introduced:

* Empty crawls were exiting on an error code when this is an expected scenario that should be handled with a clean exit.

How this addresses that need:

* Add NoValidSeedsError exception
* Update harvest CLI command to exit on NoValidSeedsError
* Add command_exit to run after all CLI commands to account for updated harvest CLI command
* Update _handle_subprocess_logging method to raise NoValidSeedsError exception and add a try/except block for JSON decode errors
* Add corresponding CLI and unit tests

Side effects of this change:

* None

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/USE-91
1 parent d186380 commit 1e61e73

File tree

5 files changed

+82
-7
lines changed

5 files changed

+82
-7
lines changed

harvester/cli.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
import os
77
from datetime import UTC, datetime, timedelta
88
from time import perf_counter
9+
from typing import Any
910

1011
import click
1112
import smart_open # type: ignore[import]
1213

1314
from harvester.config import configure_logger, configure_sentry
1415
from harvester.crawl import Crawler
16+
from harvester.exceptions import NoValidSeedsError
1517
from harvester.metadata import CrawlMetadataParser
1618
from harvester.sitemaps import SitemapsParser
1719
from harvester.utils import require_container
@@ -231,7 +233,7 @@ def generate_metadata_records(
231233
)
232234
@click.pass_context
233235
def harvest(
234-
ctx: click.Context,
236+
_ctx: click.Context,
235237
crawl_name: str,
236238
config_yaml_file: str,
237239
sitemaps: tuple[str, ...],
@@ -286,7 +288,11 @@ def harvest(
286288
btrix_args_json=btrix_args_json,
287289
urls_file=urls_file,
288290
)
289-
crawler.crawl()
291+
try:
292+
crawler.crawl()
293+
except NoValidSeedsError as e:
294+
logger.info(e)
295+
return
290296
logger.info("Crawl complete, WACZ archive located at: %s", crawler.wacz_filepath)
291297

292298
# upload WACZ if output file destination provided
@@ -325,6 +331,10 @@ def harvest(
325331
) as urls_out:
326332
urls_out.write(urls_in.read())
327333

334+
335+
@main.result_callback()
336+
@click.pass_context
337+
def command_exit(ctx: click.Context, *_args: Any, **_kwargs: Any) -> None: # noqa: ANN401
328338
logger.info(
329339
"Total elapsed: %s",
330340
str(

harvester/crawl.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88

99
import smart_open # type: ignore[import]
1010

11-
from harvester.exceptions import ConfigYamlError, WaczFileDoesNotExist
11+
from harvester.exceptions import (
12+
ConfigYamlError,
13+
NoValidSeedsError,
14+
WaczFileDoesNotExist,
15+
)
1216
from harvester.utils import require_container
1317

1418
logger = logging.getLogger(__name__)
@@ -101,11 +105,21 @@ def _handle_subprocess_logging(self, process: subprocess.Popen[str]) -> None:
101105
if '"context":"crawlStatus"' in line:
102106
self._log_crawl_count_status(line)
103107

104-
# identify fatal logs and raise Runtime exception
108+
# identify fatal logs and handle appropriately
105109
if '"logLevel":"fatal"' in line:
106-
raise RuntimeError(
107-
f"Fatal log message detected from crawler, exiting: {line}"
108-
)
110+
try:
111+
log_data = json.loads(line)
112+
if "No valid seeds specified" in log_data.get("message", ""):
113+
raise NoValidSeedsError(log_data.get("message", line))
114+
115+
raise RuntimeError(
116+
"Fatal log message detected from crawler, exiting: "
117+
f"{log_data.get('message', line)}"
118+
)
119+
except json.JSONDecodeError as e:
120+
raise RuntimeError(
121+
f"Could not parse fatal log message: {line}"
122+
) from e
109123

110124
if process.stderr: # pragma: no cover
111125
for line in process.stderr:

harvester/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,7 @@ class ConfigYamlError(Exception):
1717

1818
class ContextManagerRequiredError(Exception):
1919
pass
20+
21+
22+
class NoValidSeedsError(Exception):
23+
pass

tests/test_cli.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import smart_open
99

1010
from harvester.cli import main
11+
from harvester.exceptions import NoValidSeedsError
1112

1213

1314
def test_cli_no_command_options(caplog, runner):
@@ -292,3 +293,27 @@ def test_cli_harvest_with_sitemap_urls_output_file(caplog, runner):
292293
)
293294

294295
assert mock_write_urls.call_count == 2
296+
297+
298+
def test_cli_harvest_handles_no_valid_seeds(caplog, runner):
299+
with patch(
300+
"harvester.crawl.Crawler.crawl",
301+
side_effect=NoValidSeedsError(
302+
"No valid seeds specified, aborting crawl. Quitting"
303+
),
304+
):
305+
result = runner.invoke(
306+
main,
307+
[
308+
"--verbose",
309+
"harvest",
310+
"--crawl-name",
311+
"homepage",
312+
"--config-yaml-file",
313+
"tests/fixtures/lib-website-homepage.yaml",
314+
"--wacz-output-file",
315+
"/tmp/homepage.wacz",
316+
],
317+
)
318+
assert result.exit_code == 0
319+
assert "No valid seeds specified, aborting crawl" in caplog.text

tests/test_crawler.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from harvester.crawl import Crawler
1212
from harvester.exceptions import (
1313
ConfigYamlError,
14+
NoValidSeedsError,
1415
RequiresContainerContextError,
1516
WaczFileDoesNotExist,
1617
)
@@ -187,3 +188,24 @@ def test_crawl_fatal_log_raises_runtime_error(create_mocked_crawler):
187188
"subprocess.Popen", return_value=mock_process
188189
):
189190
crawler.crawl()
191+
192+
193+
@pytest.mark.usefixtures("_mock_inside_container")
194+
def test_crawl_no_valid_seeds_raises_exception(create_mocked_crawler):
195+
crawler = create_mocked_crawler()
196+
stdouts = [
197+
(
198+
'{"logLevel":"fatal","message":"No valid seeds specified, '
199+
'aborting crawl. Quitting","context":"general","details":{}}'
200+
)
201+
]
202+
mock_process = MagicMock()
203+
mock_process.__enter__.return_value = mock_process
204+
mock_process.__exit__.return_value = None
205+
mock_process.stdout = iter(stdouts)
206+
mock_process.stderr = iter([])
207+
mock_process.wait.return_value = 0
208+
with pytest.raises(NoValidSeedsError), patch(
209+
"subprocess.Popen", return_value=mock_process
210+
):
211+
crawler.crawl()

0 commit comments

Comments (0)