I am trying to extract the text and would like to store the correct font and font sizes for each text that is output. I have four test pdf's with which I am testing, three return the value 1 and one has the correct font size. What could be causing the problem, what are the solutions?
Here is my code:
using System.Text;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
namespace PDFTextExtraction
{
class Program
{
static void Main(string[] args)
{
string pdfFilePath = "pathtopdf.pdf";
using (PdfReader pdfReader = new PdfReader(pdfFilePath))
{
using (PdfDocument pdfDocument = new PdfDocument(pdfReader))
{
MyTextEventListener listener = new MyTextEventListener();
for (int pageNum = 1; pageNum <= pdfDocument.GetNumberOfPages(); pageNum++)
{
PdfPage page = pdfDocument.GetPage(pageNum);
PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
parser.ProcessPageContent(page);
}
Console.WriteLine("Extrahierter Text mit Schriftart und Schriftgröße:");
Console.WriteLine(listener.GetExtractedText());
}
}
}
}
public class MyTextEventListener : IEventListener
{`your text`
private readonly StringBuilder extractedText = new StringBuilder();
public void EventOccurred(IEventData data, EventType type)
{
if (type.Equals(EventType.RENDER_TEXT))
{
TextRenderInfo renderInfo = (TextRenderInfo)data;
string text = renderInfo.GetText();
string font = renderInfo.GetFont().GetFontProgram().ToString();
float fontSize = renderInfo.GetFontSize();
extractedText.AppendLine($"Text: {text} | Schriftart: {font} | Schriftgröße: {fontSize}");
}
}
public ICollection<EventType> GetSupportedEvents()
{
return new List<EventType> { EventType.RENDER_TEXT };
}
public string GetExtractedText()
{
return extractedText.ToString();
}
}
}
I have already tried the following:
float ascent = renderInfo.GetAscentLine().GetStartPoint().Get(1);
float descent = renderInfo.GetDescentLine().GetStartPoint().Get(1);
float fontSize = ascent - descent;
I had a version with iTextSharp 5.5.13 and it worked flawlessly?
I did some research and found out that the Pdf operators are defined as follows:
BT
0 0.6 0.95 0 k
/GS0 gs
/T1_0 1 Tf
0.024 Tc
-0.024 Tw
8 0 0 8 32.3386 171.361 Tm
[(V)43.2(o)2.2(r)-1(n)1.5(ame)24( Na)7.9(c)-1.4(h)-1.1(n)1.4(a)0.5(me)] TJ
ET
The font size is defined here as 1? So the font is also defined? I am not a Pdf specialist, but maybe someone here can help me? The correct font size is 8pt.
if I do the following does it work?
Matrix textMatrix = renderInfo.GetTextMatrix();
float scaleX = textMatrix.Get(Matrix.I11);
float scaleY = textMatrix.Get(Matrix.I22);
float _fontSize = (scaleX + scaleY) / 2;