@@ -15,6 +15,12 @@ const sectionHeaderRegex = /^([A-D])\s*[:.)-]\s*(.+?)\s*$/i;
// Matches a line of the form "data: <payload>", case-insensitively, allowing
// optional whitespace on either side of the colon. Capture group 1 holds the
// payload (everything after the colon and any whitespace that follows it).
const dataLineRegex = /^data\s*:\s*(.*)$/i;
// Matches a line consisting solely of the heading "BitGo KeyCard FAQ",
// case-insensitively, with one or more whitespace characters between words.
const faqHeaderRegex = /^BitGo\s+KeyCard\s+FAQ$/i;
1717
/**
 * Vertical tolerance, in PDF points, used when grouping text nodes into lines.
 * Two Y-coordinates are considered to be on different lines only when their
 * absolute difference is strictly greater than this value; a difference equal
 * to the tolerance still counts as the same line.
 */
const PDF_LINE_Y_TOLERANCE = 2;
/**
 * Horizontal gap, in PDF points, between the previous node's right edge and
 * the next node's left edge. A space is inserted between the two fragments
 * only when the gap is strictly greater than this value.
 */
const PDF_NODE_GAP_THRESHOLD = 2;
23+
/**
 * Normalizes whitespace in a string.
 *
 * Every run of one or more whitespace characters (spaces, tabs, newlines, …)
 * is collapsed to a single space, and leading/trailing whitespace is removed.
 *
 * @param input - The raw text to clean up.
 * @returns The text with whitespace collapsed and trimmed; an empty string if
 *   the input contains only whitespace.
 */
function sanitizeText(input: string): string {
  return input.replace(/\s+/g, ' ').trim();
}
@@ -43,13 +49,22 @@ function isEncryptedWalletPasswordSectionTitle(title: string): boolean {
4349 return title . toLowerCase ( ) . includes ( 'encrypted wallet password' ) ;
4450}
4551
52+ /**
53+ * Reconstructs logical text lines from an unordered set of PDF text nodes.
54+ *
55+ * PDF text extraction returns individual positioned fragments. This function
56+ * sorts them by page then Y-coordinate (top-to-bottom), groups fragments
57+ * within PDF_LINE_Y_TOLERANCE points of each other onto the same line, and
58+ * inserts a space between fragments that are separated by more than
59+ * PDF_NODE_GAP_THRESHOLD points horizontally.
60+ */
4661export function buildLinesFromPDFNodes ( nodes : PDFTextNode [ ] ) : string [ ] {
4762 const sortedNodes = [ ...nodes ] . sort ( ( a , b ) => {
4863 if ( a . page !== b . page ) {
4964 return a . page - b . page ;
5065 }
5166 const yDiff = Math . abs ( a . y - b . y ) ;
52- if ( yDiff > 2 ) {
67+ if ( yDiff > PDF_LINE_Y_TOLERANCE ) {
5368 return b . y - a . y ;
5469 }
5570 return a . x - b . x ;
@@ -74,7 +89,7 @@ export function buildLinesFromPDFNodes(nodes: PDFTextNode[]): string[] {
7489 continue ;
7590 }
7691
77- if ( previousRightEdge !== null && node . x - previousRightEdge > 2 ) {
92+ if ( previousRightEdge !== null && node . x - previousRightEdge > PDF_NODE_GAP_THRESHOLD ) {
7893 line += ' ' ;
7994 }
8095 line += piece ;
@@ -89,7 +104,7 @@ export function buildLinesFromPDFNodes(nodes: PDFTextNode[]): string[] {
89104
90105 for ( const node of sortedNodes ) {
91106 const pageChanged = node . page !== currentPage ;
92- const lineChanged = Number . isNaN ( currentY ) || Math . abs ( node . y - currentY ) > 2 ;
107+ const lineChanged = Number . isNaN ( currentY ) || Math . abs ( node . y - currentY ) > PDF_LINE_Y_TOLERANCE ;
93108 if ( pageChanged || lineChanged ) {
94109 flushLine ( ) ;
95110 currentLineNodes = [ node ] ;
0 commit comments