Remove page numbers, headers and footers from a PDF
/**
 * Extracts the text of a PDF after deleting the stamps with ids 100 and 101.
 *
 * <p>The PDF is loaded from {@code bytes}, the two stamps are deleted, and the edited document is
 * saved to memory and reloaded. Its text is then absorbed from all pages, passed through
 * {@code StringUtilities.removeAllUTF8BOM} and {@code PdfTextNormalizer.normalizePdfText}, and
 * returned.
 *
 * @param bytes raw content of the PDF file
 * @return the extracted text after BOM removal and normalization
 * @throws Exception if loading, editing, saving or text extraction fails
 */
@Nonnull
public String extract(@Nonnull byte[] bytes) throws Exception {
    // Step 1: strip the stamps and keep the edited PDF as an in-memory byte array.
    byte[] cleanedPdf;
    PdfContentEditor pce = new PdfContentEditor();
    try (InputStream fileInputStream = new ByteArrayInputStream(bytes);
            ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
        pce.bindPdf(fileInputStream);
        // Delete headers and footers.
        // NOTE(review): ids 100 and 101 are presumably the header/footer stamp ids assigned by
        // the producer of these PDFs — confirm before reusing this for other sources.
        pce.deleteStampByIds(new int[] {100, 101});
        pce.save(bos);
        cleanedPdf = bos.toByteArray();
    } finally {
        // Release the document bound to the editor; the original never closed it.
        pce.close();
    }

    // Step 2: reload the edited PDF and collect the text of every page.
    String originalText;
    try (ByteArrayInputStream bis = new ByteArrayInputStream(cleanedPdf)) {
        Document pdfDocument = new Document(bis);
        try {
            com.aspose.pdf.TextAbsorber textAbsorber = new com.aspose.pdf.TextAbsorber();
            // Accept the absorber for all the pages.
            pdfDocument.getPages().accept(textAbsorber);
            originalText = textAbsorber.getText();
        } finally {
            // Release the resources held by the document; the original never closed it.
            pdfDocument.close();
        }
    }

    // Step 3: clean up BOM symbols, then normalize the extracted text.
    String withoutBom = new StringUtilities().removeAllUTF8BOM(originalText);
    return new PdfTextNormalizer().normalizePdfText(withoutBom);
}