Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions cdx_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,27 +214,24 @@ def warcer(cmd: Namespace, cmdline: str):
kwargs_writer['size'] = kwargs['size']
del kwargs['size']

writer = cdx_toolkit.warc.get_writer(cmd.prefix, cmd.subprefix, info, **kwargs_writer)

for obj in cdx.iter(cmd.url, **kwargs):
url = obj['url']
if cmd.url_fgrep and cmd.url_fgrep not in url:
LOGGER.debug('not warcing due to fgrep: %s', url)
continue
if cmd.url_fgrepv and cmd.url_fgrepv in url:
LOGGER.debug('not warcing due to fgrepv: %s', url)
continue
timestamp = obj['timestamp']
try:
record = obj.fetch_warc_record()
except RuntimeError: # pragma: no cover
LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp)
continue
if obj.is_revisit():
LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
writer.write_record(record)

writer.close()
with cdx_toolkit.warc.get_writer(cmd.prefix, cmd.subprefix, info, **kwargs_writer) as writer:
for obj in cdx.iter(cmd.url, **kwargs):
url = obj['url']
if cmd.url_fgrep and cmd.url_fgrep not in url:
LOGGER.debug('not warcing due to fgrep: %s', url)
continue
if cmd.url_fgrepv and cmd.url_fgrepv in url:
LOGGER.debug('not warcing due to fgrepv: %s', url)
continue
timestamp = obj['timestamp']
try:
record = obj.fetch_warc_record()
except RuntimeError: # pragma: no cover
LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp)
continue
if obj.is_revisit():
LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
writer.write_record(record)


def sizer(cmd: Namespace, cmdline):
Expand Down
7 changes: 7 additions & 0 deletions cdx_toolkit/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,13 @@ def close(self):
# Close the WARC writer (this must be called at the end)
self._close_current_file()

def __enter__(self):
    """Enter the ``with`` block, binding this writer itself as the ``as`` target."""
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the writer when the ``with`` block ends.

    ``close()`` is called whether or not the block raised. The three
    exception arguments are not inspected.

    Returns:
        False, so an exception raised inside the block is not suppressed.
    """
    self.close()
    # A falsy result tells Python to keep propagating any in-flight exception.
    return False


def get_writer(prefix, subprefix, info, **kwargs):
    """Construct a ``CDXToolkitWARCWriter`` from the given arguments.

    Every positional argument and keyword option is handed to the
    ``CDXToolkitWARCWriter`` constructor unchanged.
    """
    warc_writer = CDXToolkitWARCWriter(prefix, subprefix, info, **kwargs)
    return warc_writer
41 changes: 19 additions & 22 deletions examples/iter-and-warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,22 @@
'format': 'WARC file version 1.0',
}

writer = cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1')

try:
    for obj in cdx.iter(url, limit=10):
        url = obj['url']
        status = obj['status']
        timestamp = obj['timestamp']

        print('considering extracting url', url, 'timestamp', timestamp)
        if status != '200':
            print(' skipping because status was {}, not 200'.format(status))
            continue

        try:
            record = obj.fetch_warc_record()
        except RuntimeError:
            # print() does not interpolate logging-style %s placeholders,
            # so the values are formatted into the message explicitly.
            print(' skipping capture for RuntimeError 404: {} {}'.format(url, timestamp))
            continue
        writer.write_record(record)

        print(' wrote', url)
finally:
    # Close the WARC writer even if iterating, fetching or writing raised.
    writer.close()
# Extract up to 10 captures of `url`, writing those with status 200 to a WARC file.
with cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1') as writer:
    for obj in cdx.iter(url, limit=10):
        url = obj['url']
        status = obj['status']
        timestamp = obj['timestamp']

        print('considering extracting url', url, 'timestamp', timestamp)
        if status != '200':
            print(' skipping because status was {}, not 200'.format(status))
            continue

        try:
            record = obj.fetch_warc_record()
        except RuntimeError:
            # print() does not interpolate logging-style %s placeholders,
            # so the values are formatted into the message explicitly.
            print(' skipping capture for RuntimeError 404: {} {}'.format(url, timestamp))
            continue
        writer.write_record(record)

        print(' wrote', url)
98 changes: 98 additions & 0 deletions tests/unit/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from argparse import Namespace
from unittest.mock import Mock, patch

import cdx_toolkit.cli
import cdx_toolkit.warc


def _make_fake_record():
    """Build a WARC record with ``fake_wb_warc`` from a stubbed 200 text/html response."""
    target_url = 'http://example.com'
    # Mock stands in for the HTTP response; only these four attributes are configured.
    fake_response = Mock(
        status_code=200,
        reason='OK',
        headers={'Content-Type': 'text/html'},
        content=b'<html>test</html>',
    )
    capture_row = {'url': target_url, 'timestamp': '20240101120000', 'status': '200'}
    return cdx_toolkit.warc.fake_wb_warc(
        url=target_url,
        wb_url='https://web.archive.org/web/20240101120000id_/http://example.com',
        resp=fake_response,
        capture=capture_row,
    )


def _build_warc_cmd(tmp_path, **overrides):
    """Return a ``Namespace`` of default arguments for ``warcer``.

    The output prefix is ``<tmp_path>/cli``. Any keyword in ``overrides``
    replaces the matching default or adds a new attribute.
    """
    settings = {
        'prefix': str(tmp_path / 'cli'),
        'subprefix': None,
        'url': 'http://example.com',
        'url_fgrep': None,
        'url_fgrepv': None,
        'creator': None,
        'operator': None,
    }
    settings.update(overrides)
    return Namespace(**settings)


def _make_capture(is_revisit=False, url='http://example.com'):
    """Return a Mock capture supporting ``capture['url']`` and ``capture['timestamp']``.

    ``is_revisit()`` returns the given flag and ``fetch_warc_record()``
    returns a record built by ``_make_fake_record``.
    """
    fields = {'url': url, 'timestamp': '20240101120000'}

    def _lookup(_self, key):
        # Any other key raises KeyError, as a plain dict lookup would.
        return fields[key]

    capture = Mock()
    capture.__getitem__ = _lookup
    capture.configure_mock(**{
        'is_revisit.return_value': is_revisit,
        'fetch_warc_record.return_value': _make_fake_record(),
    })
    return capture


def test_warcer_logs_revisit(tmp_path, caplog):
    """A revisit capture must produce a warning and still be written to a WARC file."""
    fetcher = Mock()
    fetcher.iter.return_value = iter([_make_capture(is_revisit=True)])
    command = _build_warc_cmd(tmp_path)

    # Replace the real fetcher setup so warcer iterates only the fake capture.
    setup_patch = patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fetcher, {}))
    with setup_patch, caplog.at_level('WARNING', logger='cdx_toolkit.cli'):
        cdx_toolkit.cli.warcer(command, cmdline='test')

    assert 'revisit record being resolved' in caplog.text
    warc_files = list(tmp_path.glob('cli-*.warc.gz'))
    assert warc_files


def test_warcer_skips_fgrep_and_fgrepv(tmp_path, caplog):
    """warcer should skip URLs not matching --url-fgrep or matching --url-fgrepv.

    Skipped captures must never be fetched, and each skip must be logged
    with its own reason and URL.
    """
    cmd = _build_warc_cmd(tmp_path, url_fgrep='wanted', url_fgrepv='forbidden')
    fgrep_rejected_url = 'http://example.com/nope'
    fgrepv_rejected_url = 'http://example.com/wanted/forbidden'
    captures = [
        _make_capture(url=fgrep_rejected_url),  # fails fgrep
        _make_capture(url=fgrepv_rejected_url),  # fails fgrepv
        _make_capture(url='http://example.com/wanted/keep'),  # kept
    ]
    fake_cdx = Mock()
    fake_cdx.iter.return_value = iter(captures)

    with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {})):
        with caplog.at_level('DEBUG', logger='cdx_toolkit.cli'):
            cdx_toolkit.cli.warcer(cmd, cmdline='test')

    # 'not warcing due to fgrep' is a prefix of the fgrepv message, so a bare
    # substring check would pass on the fgrepv line alone. Matching the full
    # message, URL included, ties each assertion to its own branch.
    assert 'not warcing due to fgrep: ' + fgrep_rejected_url in caplog.text
    assert 'not warcing due to fgrepv: ' + fgrepv_rejected_url in caplog.text
    assert captures[0].fetch_warc_record.call_count == 0
    assert captures[1].fetch_warc_record.call_count == 0
    assert captures[2].fetch_warc_record.call_count == 1


def test_warcer_passes_size_to_writer(tmp_path):
    """warcer should pop 'size' out of kwargs and forward it to the writer.

    Both halves are checked: the writer factory receives ``size`` and the
    ``cdx.iter`` call no longer does.
    """
    cmd = _build_warc_cmd(tmp_path)
    fake_cdx = Mock()
    fake_cdx.iter.return_value = iter([])
    captured = {}

    real_get_writer = cdx_toolkit.warc.get_writer

    def spy_get_writer(prefix, subprefix, info, **kwargs):
        # Record the keyword arguments, then delegate so warcer gets a real writer.
        captured.update(kwargs)
        return real_get_writer(prefix, subprefix, info, **kwargs)

    with patch.object(cdx_toolkit.cli, 'setup_cdx_fetcher_and_kwargs', return_value=(fake_cdx, {'size': 42})):
        with patch.object(cdx_toolkit.warc, 'get_writer', side_effect=spy_get_writer):
            cdx_toolkit.cli.warcer(cmd, cmdline='test')

    assert captured == {'size': 42}
    # 'size' is a writer option, so it must have been removed before the
    # remaining kwargs were passed on to cdx.iter.
    fake_cdx.iter.assert_called_once()
    _, iter_kwargs = fake_cdx.iter.call_args
    assert 'size' not in iter_kwargs
55 changes: 55 additions & 0 deletions tests/unit/test_warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,58 @@ def test_fetch_warc_record_requires_s3_deps(monkeypatch):

with pytest.raises(RuntimeError, match=r'cdx_toolkit\[s3\]'):
cdx_toolkit.warc.fetch_warc_record(capture, warc_download_prefix='s3://bucket')


def _make_fake_record():
    """Build a WARC record with ``fake_wb_warc`` from a stubbed 200 text/html response."""
    page_url = 'http://example.com'
    # Mock stands in for the HTTP response; only these four attributes are configured.
    stub_response = Mock(
        status_code=200,
        reason='OK',
        headers={'Content-Type': 'text/html'},
        content=b'<html>test</html>',
    )
    return cdx_toolkit.warc.fake_wb_warc(
        url=page_url,
        wb_url='https://web.archive.org/web/20240101120000id_/http://example.com',
        resp=stub_response,
        capture={'url': page_url, 'timestamp': '20240101120000', 'status': '200'},
    )


def test_warc_writer_context_manager_basic(tmp_path):
    """The writer must work as a context manager and flush its file on exit."""
    import os  # local import: only needed for the separator-agnostic basename below

    prefix = str(tmp_path / 'ctx')
    with cdx_toolkit.warc.get_writer(prefix, None, {'software': 'test'}) as writer:
        writer.write_record(_make_fake_record())
        filename = writer.filename

    # Leaving the with block must have released the writer's open file.
    assert writer._file_context is None
    written = list(tmp_path.glob('ctx-*.warc.gz'))
    assert len(written) == 1
    assert written[0].stat().st_size > 0
    # os.path.basename handles the platform's path separator, whereas a split
    # on '/' would return the whole path when the prefix uses backslashes.
    assert written[0].name == os.path.basename(filename)


def test_warc_writer_context_manager_returns_self(tmp_path):
    """Entering the writer's context must yield the writer object itself."""
    prefix = str(tmp_path / 'ret')
    writer = cdx_toolkit.warc.get_writer(prefix, None, {'software': 'test'})
    with writer as entered:
        assert entered is writer


def test_warc_writer_context_manager_propagates_exceptions(tmp_path):
    """An error raised inside the with block must escape it, and the writer must still be closed."""
    prefix = str(tmp_path / 'boom')
    writer = cdx_toolkit.warc.get_writer(prefix, None, {'software': 'test'})

    # pytest.raises is the outer manager, so it sees whatever the writer's exit lets through.
    with pytest.raises(RuntimeError, match='boom'), writer:
        writer.write_record(_make_fake_record())
        raise RuntimeError('boom')

    # Both handles being None shows close() ran despite the exception.
    assert writer._file_context is None
    assert writer.file_handler is None


def test_warc_writer_close_idempotent(tmp_path):
"""Calling close() twice (e.g. explicit close then __exit__) must not raise."""
# The writer is used as a context manager and close() is also called
# explicitly twice; the test passes as long as none of these calls raises.
# NOTE(review): indentation is not visible here, so it is unclear whether the
# explicit close() calls run inside or after the with block -- confirm.
writer = cdx_toolkit.warc.get_writer(str(tmp_path / 'idem'), None, {'software': 'test'})
with writer:
writer.write_record(_make_fake_record())
writer.close()
writer.close()
Loading