diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py index 79cad66..1e59e04 100644 --- a/cdx_toolkit/cli.py +++ b/cdx_toolkit/cli.py @@ -214,27 +214,24 @@ def warcer(cmd: Namespace, cmdline: str): kwargs_writer['size'] = kwargs['size'] del kwargs['size'] - writer = cdx_toolkit.warc.get_writer(cmd.prefix, cmd.subprefix, info, **kwargs_writer) - - for obj in cdx.iter(cmd.url, **kwargs): - url = obj['url'] - if cmd.url_fgrep and cmd.url_fgrep not in url: - LOGGER.debug('not warcing due to fgrep: %s', url) - continue - if cmd.url_fgrepv and cmd.url_fgrepv in url: - LOGGER.debug('not warcing due to fgrepv: %s', url) - continue - timestamp = obj['timestamp'] - try: - record = obj.fetch_warc_record() - except RuntimeError: # pragma: no cover - LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp) - continue - if obj.is_revisit(): - LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp) - writer.write_record(record) - - writer.close() + with cdx_toolkit.warc.get_writer(cmd.prefix, cmd.subprefix, info, **kwargs_writer) as writer: + for obj in cdx.iter(cmd.url, **kwargs): + url = obj['url'] + if cmd.url_fgrep and cmd.url_fgrep not in url: + LOGGER.debug('not warcing due to fgrep: %s', url) + continue + if cmd.url_fgrepv and cmd.url_fgrepv in url: + LOGGER.debug('not warcing due to fgrepv: %s', url) + continue + timestamp = obj['timestamp'] + try: + record = obj.fetch_warc_record() + except RuntimeError: # pragma: no cover + LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp) + continue + if obj.is_revisit(): + LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp) + writer.write_record(record) def sizer(cmd: Namespace, cmdline): diff --git a/cdx_toolkit/warc.py b/cdx_toolkit/warc.py index 3757ce4..e5e599e 100644 --- a/cdx_toolkit/warc.py +++ b/cdx_toolkit/warc.py @@ -266,6 +266,13 @@ def close(self): # Close the WARC writer (this must be called at the end) self._close_current_file() + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + return False + def get_writer(prefix, subprefix, info, **kwargs): return CDXToolkitWARCWriter(prefix, subprefix, info, **kwargs) diff --git a/examples/iter-and-warc.py b/examples/iter-and-warc.py index 61de9c0..1d1b820 100755 --- a/examples/iter-and-warc.py +++ b/examples/iter-and-warc.py @@ -12,25 +12,22 @@ 'format': 'WARC file version 1.0', } -writer = cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1') - -for obj in cdx.iter(url, limit=10): - url = obj['url'] - status = obj['status'] - timestamp = obj['timestamp'] - - print('considering extracting url', url, 'timestamp', timestamp) - if status != '200': - print(' skipping because status was {}, not 200'.format(status)) - continue - - try: - record = obj.fetch_warc_record() - except RuntimeError: - print(' skipping capture for RuntimeError 404: %s %s', url, timestamp) - continue - writer.write_record(record) - - print(' wrote', url) - -writer.close() +with cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1') as writer: + for obj in cdx.iter(url, limit=10): + url = obj['url'] + status = obj['status'] + timestamp = obj['timestamp'] + + print('considering extracting url', url, 'timestamp', timestamp) + if status != '200': + print(' skipping because status was {}, not 200'.format(status)) + continue + + try: + record = obj.fetch_warc_record() + except RuntimeError: + print(' skipping capture for RuntimeError 404: %s %s', url, timestamp) + continue + writer.write_record(record) + + print(' wrote', url) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py new file mode 100644 index 0000000..d8edc08 --- /dev/null +++ b/tests/unit/test_cli.py @@ -0,0 +1,98 @@ +from argparse import Namespace +from unittest.mock import Mock, patch + +import cdx_toolkit.cli +import cdx_toolkit.warc + + +def _make_fake_record(): + resp = Mock() + resp.status_code = 200 + resp.reason = 'OK' + resp.headers = {'Content-Type': 'text/html'} + resp.content = b'test' + return cdx_toolkit.warc.fake_wb_warc( + url='http://example.com', + wb_url='https://web.archive.org/web/20240101120000id_/http://example.com', + resp=resp, + capture={'url': 'http://example.com', 'timestamp': '20240101120000', 'status': '200'}, + ) + + +def _build_warc_cmd(tmp_path, **overrides): + cmd = Namespace( + prefix=str(tmp_path / 'cli'), + subprefix=None, + url='http://example.com', + url_fgrep=None, + url_fgrepv=None, + creator=None, + operator=None, + ) + for key, value in overrides.items(): + setattr(cmd, key, value) + return cmd + + +def _make_capture(is_revisit=False, url='http://example.com'): + capture = Mock() + capture.__getitem__ = lambda self, key: {'url': url, 'timestamp': '20240101120000'}[key] + capture.is_revisit.return_value = is_revisit + capture.fetch_warc_record.return_value = _make_fake_record() + return capture + + +def test_warcer_logs_revisit(tmp_path, caplog): + """warcer should log a warning when iterating over a revisit capture.""" + cmd = _build_warc_cmd(tmp_path) + fake_cdx = Mock() + fake_cdx.iter.return_value = iter([_make_capture(is_revisit=True)]) + + with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {})): + with caplog.at_level('WARNING', logger='cdx_toolkit.cli'): + cdx_toolkit.cli.warcer(cmd, cmdline='test') + + assert 'revisit record being resolved' in caplog.text + assert list(tmp_path.glob('cli-*.warc.gz')) + + +def test_warcer_skips_fgrep_and_fgrepv(tmp_path, caplog): + """warcer should skip URLs not matching --url-fgrep or matching --url-fgrepv.""" + cmd = _build_warc_cmd(tmp_path, url_fgrep='wanted', url_fgrepv='forbidden') + captures = [ + _make_capture(url='http://example.com/nope'), # fails fgrep + _make_capture(url='http://example.com/wanted/forbidden'), # fails fgrepv + _make_capture(url='http://example.com/wanted/keep'), # kept + ] + fake_cdx = Mock() + fake_cdx.iter.return_value = iter(captures) + + with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {})): + with caplog.at_level('DEBUG', logger='cdx_toolkit.cli'): + cdx_toolkit.cli.warcer(cmd, cmdline='test') + + assert 'not warcing due to fgrep' in caplog.text + assert 'not warcing due to fgrepv' in caplog.text + assert captures[0].fetch_warc_record.call_count == 0 + assert captures[1].fetch_warc_record.call_count == 0 + assert captures[2].fetch_warc_record.call_count == 1 + + +def test_warcer_passes_size_to_writer(tmp_path): + """warcer should pop 'size' out of kwargs and forward it to the writer.""" + cmd = _build_warc_cmd(tmp_path) + fake_cdx = Mock() + fake_cdx.iter.return_value = iter([]) + captured = {} + + real_get_writer = cdx_toolkit.warc.get_writer + + def spy_get_writer(prefix, subprefix, info, **kwargs): + captured.update(kwargs) + return real_get_writer(prefix, subprefix, info, **kwargs) + + with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {'size': 42})): + with patch.object(cdx_toolkit.warc, 'get_writer', side_effect=spy_get_writer): + cdx_toolkit.cli.warcer(cmd, cmdline='test') + + assert captured == {'size': 42} diff --git a/tests/unit/test_warc.py b/tests/unit/test_warc.py index 428e132..8a99fa5 100644 --- a/tests/unit/test_warc.py +++ b/tests/unit/test_warc.py @@ -276,3 +276,58 @@ def test_fetch_warc_record_requires_s3_deps(monkeypatch): with pytest.raises(RuntimeError, match=r'cdx_toolkit\[s3\]'): cdx_toolkit.warc.fetch_warc_record(capture, warc_download_prefix='s3://bucket') + + +def _make_fake_record(): + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.reason = 'OK' + mock_resp.headers = {'Content-Type': 'text/html'} + mock_resp.content = b'test' + capture = {'url': 'http://example.com', 'timestamp': '20240101120000', 'status': '200'} + return cdx_toolkit.warc.fake_wb_warc( + url='http://example.com', + wb_url='https://web.archive.org/web/20240101120000id_/http://example.com', + resp=mock_resp, + capture=capture, + ) + + +def test_warc_writer_context_manager_basic(tmp_path): + """The writer must work as a context manager and flush its file on exit.""" + prefix = str(tmp_path / 'ctx') + with cdx_toolkit.warc.get_writer(prefix, None, {'software': 'test'}) as writer: + writer.write_record(_make_fake_record()) + filename = writer.filename + + assert writer._file_context is None + written = list(tmp_path.glob('ctx-*.warc.gz')) + assert len(written) == 1 + assert written[0].stat().st_size > 0 + assert written[0].name == filename.rsplit('/', 1)[-1] + + +def test_warc_writer_context_manager_returns_self(tmp_path): + writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'ret'), None, {'software': 'test'}) + with writer as entered: + assert entered is writer + + +def test_warc_writer_context_manager_propagates_exceptions(tmp_path): + """Exceptions inside the with block must propagate, and cleanup must still run.""" + writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'boom'), None, {'software': 'test'}) + with pytest.raises(RuntimeError, match='boom'): + with writer: + writer.write_record(_make_fake_record()) + raise RuntimeError('boom') + assert writer._file_context is None + assert writer.file_handler is None + + +def test_warc_writer_close_idempotent(tmp_path): + """Calling close() twice (e.g. explicit close then __exit__) must not raise.""" + writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'idem'), None, {'software': 'test'}) + with writer: + writer.write_record(_make_fake_record()) + writer.close() + writer.close()