commoncrawl · malteos · May 28, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py
@@ -214,27 +214,24 @@ def warcer(cmd: Namespace, cmdline: str):
         kwargs_writer['size'] = kwargs['size']
         del kwargs['size']
 
-    writer = cdx_toolkit.warc.get_writer(cmd.prefix, cmd.subprefix, info, **kwargs_writer)
-
-    for obj in cdx.iter(cmd.url, **kwargs):
-        url = obj['url']
-        if cmd.url_fgrep and cmd.url_fgrep not in url:
-            LOGGER.debug('not warcing due to fgrep: %s', url)
-            continue
-        if cmd.url_fgrepv and cmd.url_fgrepv in url:
-            LOGGER.debug('not warcing due to fgrepv: %s', url)
-            continue
-        timestamp = obj['timestamp']
-        try:
-            record = obj.fetch_warc_record()
-        except RuntimeError:  # pragma: no cover
-            LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp)
-            continue
-        if obj.is_revisit():
-            LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
-        writer.write_record(record)
-
-    writer.close()
+    with cdx_toolkit.warc.get_writer(cmd.prefix, cmd.subprefix, info, **kwargs_writer) as writer:
+        for obj in cdx.iter(cmd.url, **kwargs):
+            url = obj['url']
+            if cmd.url_fgrep and cmd.url_fgrep not in url:
+                LOGGER.debug('not warcing due to fgrep: %s', url)
+                continue
+            if cmd.url_fgrepv and cmd.url_fgrepv in url:
+                LOGGER.debug('not warcing due to fgrepv: %s', url)
+                continue
+            timestamp = obj['timestamp']
+            try:
+                record = obj.fetch_warc_record()
+            except RuntimeError:  # pragma: no cover
+                LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp)
+                continue
+            if obj.is_revisit():
+                LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
+            writer.write_record(record)
 
 
 def sizer(cmd: Namespace, cmdline):

diff --git a/cdx_toolkit/warc.py b/cdx_toolkit/warc.py
@@ -266,6 +266,13 @@ def close(self):
         # Close the WARC writer (this must be called at the end)
         self._close_current_file()
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return False
+
 
 def get_writer(prefix, subprefix, info, **kwargs):
     return CDXToolkitWARCWriter(prefix, subprefix, info, **kwargs)
diff --git a/examples/iter-and-warc.py b/examples/iter-and-warc.py
@@ -12,25 +12,22 @@
     'format': 'WARC file version 1.0',
 }
 
-writer = cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1')
-
-for obj in cdx.iter(url, limit=10):
-    url = obj['url']
-    status = obj['status']
-    timestamp = obj['timestamp']
-
-    print('considering extracting url', url, 'timestamp', timestamp)
-    if status != '200':
-        print(' skipping because status was {}, not 200'.format(status))
-        continue
-
-    try:
-        record = obj.fetch_warc_record()
-    except RuntimeError:
-        print(' skipping capture for RuntimeError 404: %s %s', url, timestamp)
-        continue
-    writer.write_record(record)
-
-    print(' wrote', url)
-
-writer.close()
+with cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1') as writer:
+    for obj in cdx.iter(url, limit=10):
+        url = obj['url']
+        status = obj['status']
+        timestamp = obj['timestamp']
+
+        print('considering extracting url', url, 'timestamp', timestamp)
+        if status != '200':
+            print(' skipping because status was {}, not 200'.format(status))
+            continue
+
+        try:
+            record = obj.fetch_warc_record()
+        except RuntimeError:
+            print(' skipping capture for RuntimeError 404: %s %s', url, timestamp)
+            continue
+        writer.write_record(record)
+
+        print(' wrote', url)
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
@@ -0,0 +1,98 @@
+from argparse import Namespace
+from unittest.mock import Mock, patch
+
+import cdx_toolkit.cli
+import cdx_toolkit.warc
+
+
+def _make_fake_record():
+    resp = Mock()
+    resp.status_code = 200
+    resp.reason = 'OK'
+    resp.headers = {'Content-Type': 'text/html'}
+    resp.content = b'<html>test</html>'
+    return cdx_toolkit.warc.fake_wb_warc(
+        url='http://example.com',
+        wb_url='https://web.archive.org/web/20240101120000id_/http://example.com',
+        resp=resp,
+        capture={'url': 'http://example.com', 'timestamp': '20240101120000', 'status': '200'},
+    )
+
+
+def _build_warc_cmd(tmp_path, **overrides):
+    cmd = Namespace(
+        prefix=str(tmp_path / 'cli'),
+        subprefix=None,
+        url='http://example.com',
+        url_fgrep=None,
+        url_fgrepv=None,
+        creator=None,
+        operator=None,
+    )
+    for key, value in overrides.items():
+        setattr(cmd, key, value)
+    return cmd
+
+
+def _make_capture(is_revisit=False, url='http://example.com'):
+    capture = Mock()
+    capture.__getitem__ = lambda self, key: {'url': url, 'timestamp': '20240101120000'}[key]
+    capture.is_revisit.return_value = is_revisit
+    capture.fetch_warc_record.return_value = _make_fake_record()
+    return capture
+
+
+def test_warcer_logs_revisit(tmp_path, caplog):
+    """warcer should log a warning when iterating over a revisit capture."""
+    cmd = _build_warc_cmd(tmp_path)
+    fake_cdx = Mock()
+    fake_cdx.iter.return_value = iter([_make_capture(is_revisit=True)])
+
+    with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {})):
+        with caplog.at_level('WARNING', logger='cdx_toolkit.cli'):
+            cdx_toolkit.cli.warcer(cmd, cmdline='test')
+
+    assert 'revisit record being resolved' in caplog.text
+    assert list(tmp_path.glob('cli-*.warc.gz'))
+
+
+def test_warcer_skips_fgrep_and_fgrepv(tmp_path, caplog):
+    """warcer should skip URLs not matching --url-fgrep or matching --url-fgrepv."""
+    cmd = _build_warc_cmd(tmp_path, url_fgrep='wanted', url_fgrepv='forbidden')
+    captures = [
+        _make_capture(url='http://example.com/nope'),         # fails fgrep
+        _make_capture(url='http://example.com/wanted/forbidden'),  # fails fgrepv
+        _make_capture(url='http://example.com/wanted/keep'),  # kept
+    ]
+    fake_cdx = Mock()
+    fake_cdx.iter.return_value = iter(captures)
+
+    with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {})):
+        with caplog.at_level('DEBUG', logger='cdx_toolkit.cli'):
+            cdx_toolkit.cli.warcer(cmd, cmdline='test')
+
+    assert 'not warcing due to fgrep' in caplog.text
+    assert 'not warcing due to fgrepv' in caplog.text
+    assert captures[0].fetch_warc_record.call_count == 0
+    assert captures[1].fetch_warc_record.call_count == 0
+    assert captures[2].fetch_warc_record.call_count == 1
+
+
+def test_warcer_passes_size_to_writer(tmp_path):
+    """warcer should pop 'size' out of kwargs and forward it to the writer."""
+    cmd = _build_warc_cmd(tmp_path)
+    fake_cdx = Mock()
+    fake_cdx.iter.return_value = iter([])
+    captured = {}
+
+    real_get_writer = cdx_toolkit.warc.get_writer
+
+    def spy_get_writer(prefix, subprefix, info, **kwargs):
+        captured.update(kwargs)
+        return real_get_writer(prefix, subprefix, info, **kwargs)
+
+    with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {'size': 42})):
+        with patch.object(cdx_toolkit.warc, 'get_writer', side_effect=spy_get_writer):
+            cdx_toolkit.cli.warcer(cmd, cmdline='test')
+
+    assert captured == {'size': 42}
diff --git a/tests/unit/test_warc.py b/tests/unit/test_warc.py
@@ -276,3 +276,58 @@ def test_fetch_warc_record_requires_s3_deps(monkeypatch):
 
     with pytest.raises(RuntimeError, match=r'cdx_toolkit\[s3\]'):
         cdx_toolkit.warc.fetch_warc_record(capture, warc_download_prefix='s3://bucket')
+
+
+def _make_fake_record():
+    mock_resp = Mock()
+    mock_resp.status_code = 200
+    mock_resp.reason = 'OK'
+    mock_resp.headers = {'Content-Type': 'text/html'}
+    mock_resp.content = b'<html>test</html>'
+    capture = {'url': 'http://example.com', 'timestamp': '20240101120000', 'status': '200'}
+    return cdx_toolkit.warc.fake_wb_warc(
+        url='http://example.com',
+        wb_url='https://web.archive.org/web/20240101120000id_/http://example.com',
+        resp=mock_resp,
+        capture=capture,
+    )
+
+
+def test_warc_writer_context_manager_basic(tmp_path):
+    """The writer must work as a context manager and flush its file on exit."""
+    prefix = str(tmp_path / 'ctx')
+    with cdx_toolkit.warc.get_writer(prefix, None, {'software': 'test'}) as writer:
+        writer.write_record(_make_fake_record())
+        filename = writer.filename
+
+    assert writer._file_context is None
+    written = list(tmp_path.glob('ctx-*.warc.gz'))
+    assert len(written) == 1
+    assert written[0].stat().st_size > 0
+    assert written[0].name == filename.rsplit('/', 1)[-1]
+
+
+def test_warc_writer_context_manager_returns_self(tmp_path):
+    writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'ret'), None, {'software': 'test'})
+    with writer as entered:
+        assert entered is writer
+
+
+def test_warc_writer_context_manager_propagates_exceptions(tmp_path):
+    """Exceptions inside the with block must propagate, and cleanup must still run."""
+    writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'boom'), None, {'software': 'test'})
+    with pytest.raises(RuntimeError, match='boom'):
+        with writer:
+            writer.write_record(_make_fake_record())
+            raise RuntimeError('boom')
+    assert writer._file_context is None
+    assert writer.file_handler is None
+
+
+def test_warc_writer_close_idempotent(tmp_path):
+    """Calling close() twice (e.g. explicit close then __exit__) must not raise."""
+    writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'idem'), None, {'software': 'test'})
+    with writer:
+        writer.write_record(_make_fake_record())
+        writer.close()
+    writer.close()