Получение координат строки с использованием ITextExtractionStrategy и LocationTextExtractionStrategy в Itextsharp

Question

Получение координат строки с использованием ITextExtractionStrategy и LocationTextExtractionStrategy в Itextsharp

У меня есть PDF-файл, текст которого я считываю в строку с помощью ITextExtractionStrategy. Из этой строки я беру подстроку, например My name is XYZ, и мне нужно получить координаты прямоугольника, который эта подстрока занимает в PDF-файле, но у меня не получается это сделать. Поиск показал, что для этого существует LocationTextExtractionStrategy, но я не понимаю, как с его помощью получить координаты.

Вот код..

// Extract the text of the current page with the simple (non-positional) strategy.
// pdfReader and page are declared outside this snippet.
ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
// Round-trip: string -> Encoding.Default bytes -> converted to UTF-8 bytes -> decoded as UTF-8.
// NOTE(review): presumably any character Encoding.Default cannot represent is lost here,
// and the step looks unnecessary for an already-decoded .NET string - confirm before keeping it.
currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
// Accumulate the page text (text is declared outside this snippet).
text.Append(currentText);

// The substring whose rectangle on the page is wanted.
string getcoordinate="My name is XYZ";

Как я могу получить прямоугольную координату этой подстроки, используя ITEXTSHARP..

Пожалуйста помоги.

23

c# itextsharp

Источник

user3664608 28 май '14 в 11:05

5 ответов

Решение

Это старый вопрос, но я оставляю здесь свой ответ, так как я не мог найти правильный ответ в Интернете.

Как показал Крис Хаас, работать со словами непросто, поскольку iText оперирует фрагментами текста (chunks). Код, который опубликовал Крис, в большинстве моих тестов не срабатывал, потому что слово обычно оказывается разбито на несколько фрагментов (он сам предупреждает об этом в своём ответе).

Чтобы решить эту проблему, вот стратегия, которую я использовал:

Разбить фрагменты на отдельные символы (фактически — по одному объекту TextRenderInfo на каждый символ).
Сгруппировать символы по строкам. Это не так просто, поскольку приходится учитывать выравнивание фрагментов.
Найти искомый текст в каждой строке.

Я оставляю здесь код. Я протестировал его на нескольких документах, и он работает довольно хорошо, но в некоторых сценариях может дать сбой, потому что преобразование «фрагмент -> слово» — задача непростая.

Надеюсь, это кому-нибудь поможет.

  /// <summary>
  /// Text extraction strategy that records every rendered chunk together with its
  /// per-character render info, groups consecutive chunks into lines and reports the
  /// position of every occurrence of a search string.
  /// </summary>
  /// <remarks>
  /// Results are produced by <see cref="GetResultantText"/> and exposed through
  /// <see cref="m_SearchResultsList"/>. Chunks are grouped in the order in which they
  /// were rendered; they are not sorted by position.
  /// </remarks>
  class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
{
    // Every chunk passed to RenderText, in render order.
    private List<LocationTextExtractionStrategyEx.ExtendedTextChunk> m_DocChunks = new List<ExtendedTextChunk>();
    // Lines rebuilt from m_DocChunks on each GetResultantText call.
    private List<LocationTextExtractionStrategyEx.LineInfo> m_LinesTextInfo = new List<LineInfo>();
    // One entry per occurrence of the search text found by the last GetResultantText call.
    public List<SearchResult> m_SearchResultsList = new List<SearchResult>();
    private String m_SearchText;
    // Factor applied to PDF coordinates to express them in millimetres.
    public const float PDF_PX_TO_MM = 0.3528f;
    // Page height in PDF units; used to measure Y from the top of the page.
    public float m_PageSizeY;


    /// <summary>Creates a strategy that looks for <paramref name="sSearchText"/>.</summary>
    /// <param name="sSearchText">Text to locate; a null or empty value yields no results.</param>
    /// <param name="fPageSizeY">Height of the page in PDF units.</param>
    public LocationTextExtractionStrategyEx(String sSearchText, float fPageSizeY)
        : base()
    {
        this.m_SearchText = sSearchText;
        this.m_PageSizeY = fPageSizeY;
    }

    /// <summary>
    /// Scans every grouped line and adds one <see cref="SearchResult"/> per
    /// non-overlapping occurrence of the search text.
    /// </summary>
    private void searchText()
    {
        // Nothing sensible can be located for a null or empty needle.
        if (string.IsNullOrEmpty(m_SearchText))
        {
            return;
        }

        foreach (LineInfo aLineInfo in m_LinesTextInfo)
        {
            // Ordinal comparison: the returned index is used as a position in the per-character
            // list, so it must count UTF-16 code units exactly.
            int iIndex = aLineInfo.m_Text.IndexOf(m_SearchText, StringComparison.Ordinal);
            while (iIndex != -1)
            {
                // Guard against a character list shorter than the line text.
                if (iIndex < aLineInfo.m_LineCharsList.Count)
                {
                    TextRenderInfo aFirstLetter = aLineInfo.m_LineCharsList[iIndex];
                    this.m_SearchResultsList.Add(new SearchResult(aFirstLetter, m_PageSizeY));
                }
                iIndex = aLineInfo.m_Text.IndexOf(m_SearchText, iIndex + m_SearchText.Length, StringComparison.Ordinal);
            }
        }
    }

    /// <summary>
    /// Rebuilds <c>m_LinesTextInfo</c>: consecutive chunks for which <c>sameLine</c> is true
    /// are merged into one <see cref="LineInfo"/>; any other chunk starts a new line.
    /// </summary>
    private void groupChunksbyLine()
    {
        // Start from scratch so that calling GetResultantText more than once does not duplicate lines.
        this.m_LinesTextInfo.Clear();

        LocationTextExtractionStrategyEx.ExtendedTextChunk previousChunk = null;
        LocationTextExtractionStrategyEx.LineInfo currentLine = null;
        foreach (LocationTextExtractionStrategyEx.ExtendedTextChunk chunk in this.m_DocChunks)
        {
            if (previousChunk != null && chunk.sameLine(previousChunk))
            {
                currentLine.appendText(chunk);
            }
            else
            {
                currentLine = new LocationTextExtractionStrategyEx.LineInfo(chunk);
                this.m_LinesTextInfo.Add(currentLine);
            }
            previousChunk = chunk;
        }
    }

    /// <summary>
    /// Groups the collected chunks into lines and runs the search, refreshing
    /// <see cref="m_SearchResultsList"/>.
    /// </summary>
    /// <returns>Always an empty string; the results are in <see cref="m_SearchResultsList"/>.</returns>
    public override string GetResultantText()
    {
        // Drop results from any previous call so repeated calls do not accumulate duplicates.
        this.m_SearchResultsList.Clear();
        groupChunksbyLine();
        searchText();
        //In this case the return value is not useful
        return "";
    }

    /// <summary>Stores the chunk text, its baseline end points and its per-character render infos.</summary>
    /// <param name="renderInfo">Render information supplied for the chunk.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment baseline = renderInfo.GetBaseline();
        //Create ExtendedChunk
        ExtendedTextChunk aExtendedChunk = new ExtendedTextChunk(renderInfo.GetText(), baseline.GetStartPoint(), baseline.GetEndPoint(), renderInfo.GetSingleSpaceWidth(), renderInfo.GetCharacterRenderInfos().ToList());
        this.m_DocChunks.Add(aExtendedChunk);
    }

    /// <summary>A rendered chunk plus the values needed to decide whether two chunks share a line.</summary>
    public class ExtendedTextChunk
    {
        public string m_text;
        private Vector m_startLocation;
        private Vector m_endLocation;
        // Unit vector from the start to the end of the baseline.
        private Vector m_orientationVector;
        // Baseline angle (atan2 of the orientation vector) multiplied by 1000 and truncated to int.
        private int m_orientationMagnitude;
        // Third component of (start - (0,0,1)) x orientation, truncated to int.
        private int m_distPerpendicular;
        private float m_charSpaceWidth;
        public List<TextRenderInfo> m_ChunkChars;


        /// <summary>Captures the chunk data and precomputes the line-comparison values.</summary>
        /// <param name="txt">Text of the chunk.</param>
        /// <param name="startLoc">Start point of the baseline.</param>
        /// <param name="endLoc">End point of the baseline.</param>
        /// <param name="charSpaceWidth">Width of a single space as reported by the render info.</param>
        /// <param name="chunkChars">Per-character render infos of the chunk.</param>
        public ExtendedTextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth,List<TextRenderInfo> chunkChars)
        {
            this.m_text = txt;
            this.m_startLocation = startLoc;
            this.m_endLocation = endLoc;
            this.m_charSpaceWidth = charSpaceWidth;
            this.m_orientationVector = this.m_endLocation.Subtract(this.m_startLocation).Normalize();
            this.m_orientationMagnitude = (int)(Math.Atan2((double)this.m_orientationVector[1], (double)this.m_orientationVector[0]) * 1000.0);
            this.m_distPerpendicular = (int)this.m_startLocation.Subtract(new Vector(0.0f, 0.0f, 1f)).Cross(this.m_orientationVector)[2];
            this.m_ChunkChars = chunkChars;
        }


        /// <summary>
        /// True when both chunks have the same truncated orientation and the same truncated
        /// perpendicular distance.
        /// </summary>
        public bool sameLine(LocationTextExtractionStrategyEx.ExtendedTextChunk textChunkToCompare)
        {
            return this.m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude && this.m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
        }
    }

    /// <summary>Position of one occurrence, in millimetres, with Y measured from the top of the page.</summary>
    public class SearchResult
    {
        public int iPosX;
        public int iPosY;

        /// <summary>Derives the position from the start of the character's ascent line.</summary>
        /// <param name="aCharcter">Render info of the first character of the occurrence.</param>
        /// <param name="fPageSizeY">Page height in PDF units.</param>
        public SearchResult(TextRenderInfo aCharcter, float fPageSizeY)
        {
            //Get position of upperLeft coordinate
            Vector vTopLeft = aCharcter.GetAscentLine().GetStartPoint();
            float fPosX = vTopLeft[Vector.I1];
            float fPosY = vTopLeft[Vector.I2];
            //Transform to mm and get y from top of page
            iPosX = Convert.ToInt32(fPosX * PDF_PX_TO_MM);
            iPosY = Convert.ToInt32((fPageSizeY - fPosY) * PDF_PX_TO_MM);
        }
    }

    /// <summary>Text of one line together with the render info of each of its characters.</summary>
    public class LineInfo
    {
        public string m_Text;
        public List<TextRenderInfo> m_LineCharsList;

        /// <summary>Starts a line from its first chunk.</summary>
        public LineInfo(LocationTextExtractionStrategyEx.ExtendedTextChunk initialTextChunk)
        {
            this.m_Text = initialTextChunk.m_text;
            // Copy the list: appendText mutates it, and the chunk's own list must stay untouched.
            this.m_LineCharsList = new List<TextRenderInfo>(initialTextChunk.m_ChunkChars);
        }

        /// <summary>Appends the text and the character infos of a further chunk on the same line.</summary>
        public void appendText(LocationTextExtractionStrategyEx.ExtendedTextChunk additionalTextChunk)
        {
            m_LineCharsList.AddRange(additionalTextChunk.m_ChunkChars);
            this.m_Text += additionalTextChunk.m_text;
        }
    }
}

9

Источник

user1658318 08 окт '15 в 11:26

Я знаю, что это действительно старый вопрос, но ниже приведено то, что я в итоге сделал. Просто разместив его здесь в надежде, что это будет полезно для кого-то еще.

Следующий код сообщит вам начальные координаты строки (строк), содержащей искомый текст. Его нетрудно изменить так, чтобы он возвращал положения отдельных слов. Примечание: я проверял это на iTextSharp 5.5.11.0; на некоторых более старых версиях код работать не будет.

Как уже упоминалось выше, в PDF-файлах нет понятия слов / строк или абзацев. Но я обнаружил, что LocationTextExtractionStrategy очень хорошо разбивает строки и слова. Поэтому мое решение основано на этом.

ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ:

Это решение основано на https://github.com/itext/itextsharp/blob/develop/src/core/iTextSharp/text/pdf/parser/LocationTextExtractionStrategy.cs и этот файл содержит комментарий о предварительном просмотре. Так что это может не сработать в будущем.

В любом случае вот код.

using System.Collections.Generic;
using iTextSharp.text.pdf.parser;

namespace Logic
{
    /// <summary>
    /// Location-based extraction strategy that keeps its own list of text chunks and
    /// can return the text of each line together with the start position of that line.
    /// </summary>
    public class LocationTextExtractionStrategyWithPosition : LocationTextExtractionStrategy
    {
        // Chunks collected by RenderText; sorted in place by GetLocations.
        private readonly List<TextChunk> locationalResult = new List<TextChunk>();

        // Strategy used to build the location of every chunk.
        private readonly ITextChunkLocationStrategy tclStrat;

        /// <summary>Creates the strategy with the default chunk-location implementation.</summary>
        public LocationTextExtractionStrategyWithPosition() : this(new TextChunkLocationStrategyDefaultImp())
        {
        }

        /// <summary>
        /// Creates the strategy with a custom way of creating chunk locations from the
        /// render info.
        /// </summary>
        /// <param name="strat">The custom location strategy.</param>
        public LocationTextExtractionStrategyWithPosition(ITextChunkLocationStrategy strat)
        {
            tclStrat = strat;
        }

        // True when the string is non-empty and its first character is a blank.
        private bool StartsWithSpace(string str)
        {
            return str.Length != 0 && str[0] == ' ';
        }

        // True when the string is non-empty and its last character is a blank.
        private bool EndsWithSpace(string str)
        {
            return str.Length != 0 && str[str.Length - 1] == ' ';
        }

        /// <summary>Applies an optional filter to the collected chunks.</summary>
        /// <param name="textChunks">All chunks found during processing.</param>
        /// <param name="filter">Filter to apply; when null the input list itself is returned.</param>
        /// <returns>The input list (no filter) or a new list holding the accepted chunks.</returns>
        private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, ITextChunkFilter filter)
        {
            return filter == null ? textChunks : textChunks.FindAll(filter.Accept);
        }

        /// <summary>Records the chunk with a location computed on the rise-corrected baseline.</summary>
        /// <param name="renderInfo">Render information supplied for the chunk.</param>
        public override void RenderText(TextRenderInfo renderInfo)
        {
            LineSegment baseline = renderInfo.GetBaseline();
            float rise = renderInfo.GetRise();
            if (rise != 0)
            {
                // Undo the rise so that super/subscript text is treated as lying on the
                // baseline of the text it is relative to.
                baseline = baseline.TransformBy(new Matrix(0, -rise));
            }

            locationalResult.Add(new TextChunk(renderInfo.GetText(), tclStrat.CreateLocation(renderInfo, baseline)));
        }

        // Builds the entry that opens a new line: the chunk text plus its start point in millimetres.
        private static TextLocation StartOfLine(TextChunk chunk)
        {
            var location = new TextLocation();
            location.Text = chunk.Text;
            location.X = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[0]);
            location.Y = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[1]);
            return location;
        }

        /// <summary>
        /// Sorts the collected chunks and merges chunks on the same line into one entry
        /// whose X/Y are those of the first chunk of the line.
        /// </summary>
        /// <returns>One <see cref="TextLocation"/> per line, in sorted chunk order.</returns>
        public IList<TextLocation> GetLocations()
        {
            var ordered = filterTextChunks(locationalResult, null);
            ordered.Sort();

            var lines = new List<TextLocation>();
            TextChunk previous = null;

            foreach (var current in ordered)
            {
                bool continuesLine = previous != null && current.SameLine(previous);
                if (continuesLine)
                {
                    // A blank is inserted only at a word boundary, and only when neither side
                    // already supplies one.
                    bool needsBlank = IsChunkAtWordBoundary(current, previous)
                                      && !StartsWithSpace(current.Text)
                                      && !EndsWithSpace(previous.Text);

                    TextLocation line = lines[lines.Count - 1];
                    line.Text += needsBlank ? " " + current.Text : current.Text;
                }
                else
                {
                    lines.Add(StartOfLine(current));
                }

                previous = current;
            }

            return lines;
        }

    }

    /// <summary>Text of one line together with the coordinates at which the line starts.</summary>
    public class TextLocation
    {
        /// <summary>Text of the line.</summary>
        public string Text { get; set; }

        /// <summary>Horizontal start coordinate.</summary>
        public float X { get; set; }

        /// <summary>Vertical start coordinate.</summary>
        public float Y { get; set; }
    }
}

Как вызвать метод:

        // Declared outside the using block so the result is still in scope after the reader is disposed.
        IList<TextLocation> res;

        using (var reader = new PdfReader(inputPdf))
        {
            var parser = new PdfReaderContentParser(reader);

            // ProcessContent returns the listener it was given, so the strategy comes back typed.
            var strategy = parser.ProcessContent(pageNumber, new LocationTextExtractionStrategyWithPosition());

            res = strategy.GetLocations();
            // No explicit reader.Close(): leaving the using block disposes the reader.
        }

        // Lines containing the search text, ordered by descending Y
        // (presumably so that lines nearer the top of the page come first - confirm).
        var searchResult = res.Where(p => p.Text.Contains(searchText)).OrderBy(p => p.Y).Reverse().ToList();




inputPdf is a byte[] that has the pdf data

pageNumber is the page where you want to search in

6

Источник

user381407 09 июн '17 в 06:11

@Иван Басарт, спасибо тебе огромное.

вот полный код для тех, кому нужно сэкономить время на коде Ивана.

      using System.Collections.Generic;
using System;

/// <summary>
/// Console demo: searches every page of test.pdf on the desktop for a fixed text and
/// prints the position of each hit.
/// </summary>
class Program
{

    static void Main()
    {
        //Our test file
        var testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.pdf");
        var searchText = "searchWords";

        using (var reader = new PdfReader(testFile))
        {

            for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
            {
                // Use the height of the page being processed (not always page 1), so the
                // top-relative Y stays correct when pages differ in size.
                float pageHeight = reader.GetPageSize(pageNumber).Height;
                LocationTextExtractionStrategyEx strategy = new LocationTextExtractionStrategyEx(searchText, pageHeight);

                // The returned text is not needed; the call drives the strategy, which
                // fills m_SearchResultsList.
                PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);

                foreach (LocationTextExtractionStrategyEx.SearchResult result in strategy.m_SearchResultsList)
                {
                    Console.WriteLine("Found at position: X = {0}, Y = {1}", result.iPosX, result.iPosY);
                }
            }
        }
    }
}



/// <summary>
/// Text extraction strategy that records every rendered chunk together with its
/// per-character render info, groups consecutive chunks into lines and reports the
/// position of every occurrence of a search string.
/// </summary>
/// <remarks>
/// Results are produced by <see cref="GetResultantText"/> and exposed through
/// <see cref="m_SearchResultsList"/>. Chunks are grouped in the order in which they
/// were rendered; they are not sorted by position.
/// </remarks>
class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
{
    // Every chunk passed to RenderText, in render order.
    private List<LocationTextExtractionStrategyEx.ExtendedTextChunk> m_DocChunks = new List<ExtendedTextChunk>();
    // Lines rebuilt from m_DocChunks on each GetResultantText call.
    private List<LocationTextExtractionStrategyEx.LineInfo> m_LinesTextInfo = new List<LineInfo>();
    // One entry per occurrence of the search text found by the last GetResultantText call.
    public List<SearchResult> m_SearchResultsList = new List<SearchResult>();
    private String m_SearchText;
    // Factor applied to PDF coordinates to express them in millimetres.
    public const float PDF_PX_TO_MM = 0.3528f;
    // Page height in PDF units; used to measure Y from the top of the page.
    public float m_PageSizeY;


    /// <summary>Creates a strategy that looks for <paramref name="sSearchText"/>.</summary>
    /// <param name="sSearchText">Text to locate; a null or empty value yields no results.</param>
    /// <param name="fPageSizeY">Height of the page in PDF units.</param>
    public LocationTextExtractionStrategyEx(String sSearchText, float fPageSizeY)
        : base()
    {
        this.m_SearchText = sSearchText;
        this.m_PageSizeY = fPageSizeY;
    }

    /// <summary>
    /// Scans every grouped line and adds one <see cref="SearchResult"/> per
    /// non-overlapping occurrence of the search text.
    /// </summary>
    private void searchText()
    {
        // Nothing sensible can be located for a null or empty needle.
        if (string.IsNullOrEmpty(m_SearchText))
        {
            return;
        }

        foreach (LineInfo aLineInfo in m_LinesTextInfo)
        {
            // Ordinal comparison: the returned index is used as a position in the per-character
            // list, so it must count UTF-16 code units exactly.
            int iIndex = aLineInfo.m_Text.IndexOf(m_SearchText, StringComparison.Ordinal);
            while (iIndex != -1)
            {
                // Guard against a character list shorter than the line text.
                if (iIndex < aLineInfo.m_LineCharsList.Count)
                {
                    TextRenderInfo aFirstLetter = aLineInfo.m_LineCharsList[iIndex];
                    this.m_SearchResultsList.Add(new SearchResult(aFirstLetter, m_PageSizeY));
                }
                iIndex = aLineInfo.m_Text.IndexOf(m_SearchText, iIndex + m_SearchText.Length, StringComparison.Ordinal);
            }
        }
    }

    /// <summary>
    /// Rebuilds <c>m_LinesTextInfo</c>: consecutive chunks for which <c>sameLine</c> is true
    /// are merged into one <see cref="LineInfo"/>; any other chunk starts a new line.
    /// </summary>
    private void groupChunksbyLine()
    {
        // Start from scratch so that calling GetResultantText more than once does not duplicate lines.
        this.m_LinesTextInfo.Clear();

        LocationTextExtractionStrategyEx.ExtendedTextChunk previousChunk = null;
        LocationTextExtractionStrategyEx.LineInfo currentLine = null;
        foreach (LocationTextExtractionStrategyEx.ExtendedTextChunk chunk in this.m_DocChunks)
        {
            if (previousChunk != null && chunk.sameLine(previousChunk))
            {
                currentLine.appendText(chunk);
            }
            else
            {
                currentLine = new LocationTextExtractionStrategyEx.LineInfo(chunk);
                this.m_LinesTextInfo.Add(currentLine);
            }
            previousChunk = chunk;
        }
    }

    /// <summary>
    /// Groups the collected chunks into lines and runs the search, refreshing
    /// <see cref="m_SearchResultsList"/>.
    /// </summary>
    /// <returns>Always an empty string; the results are in <see cref="m_SearchResultsList"/>.</returns>
    public override string GetResultantText()
    {
        // Drop results from any previous call so repeated calls do not accumulate duplicates.
        this.m_SearchResultsList.Clear();
        groupChunksbyLine();
        searchText();
        //In this case the return value is not useful
        return "";
    }

    /// <summary>Stores the chunk text, its baseline end points and its per-character render infos.</summary>
    /// <param name="renderInfo">Render information supplied for the chunk.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment baseline = renderInfo.GetBaseline();
        //Create ExtendedChunk
        ExtendedTextChunk aExtendedChunk = new ExtendedTextChunk(renderInfo.GetText(), baseline.GetStartPoint(), baseline.GetEndPoint(), renderInfo.GetSingleSpaceWidth(), renderInfo.GetCharacterRenderInfos().ToList());
        this.m_DocChunks.Add(aExtendedChunk);
    }

    /// <summary>A rendered chunk plus the values needed to decide whether two chunks share a line.</summary>
    public class ExtendedTextChunk
    {
        public string m_text;
        private Vector m_startLocation;
        private Vector m_endLocation;
        // Unit vector from the start to the end of the baseline.
        private Vector m_orientationVector;
        // Baseline angle (atan2 of the orientation vector) multiplied by 1000 and truncated to int.
        private int m_orientationMagnitude;
        // Third component of (start - (0,0,1)) x orientation, truncated to int.
        private int m_distPerpendicular;
        private float m_charSpaceWidth;
        public List<TextRenderInfo> m_ChunkChars;


        /// <summary>Captures the chunk data and precomputes the line-comparison values.</summary>
        /// <param name="txt">Text of the chunk.</param>
        /// <param name="startLoc">Start point of the baseline.</param>
        /// <param name="endLoc">End point of the baseline.</param>
        /// <param name="charSpaceWidth">Width of a single space as reported by the render info.</param>
        /// <param name="chunkChars">Per-character render infos of the chunk.</param>
        public ExtendedTextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, List<TextRenderInfo> chunkChars)
        {
            this.m_text = txt;
            this.m_startLocation = startLoc;
            this.m_endLocation = endLoc;
            this.m_charSpaceWidth = charSpaceWidth;
            this.m_orientationVector = this.m_endLocation.Subtract(this.m_startLocation).Normalize();
            this.m_orientationMagnitude = (int)(Math.Atan2((double)this.m_orientationVector[1], (double)this.m_orientationVector[0]) * 1000.0);
            this.m_distPerpendicular = (int)this.m_startLocation.Subtract(new Vector(0.0f, 0.0f, 1f)).Cross(this.m_orientationVector)[2];
            this.m_ChunkChars = chunkChars;
        }


        /// <summary>
        /// True when both chunks have the same truncated orientation and the same truncated
        /// perpendicular distance.
        /// </summary>
        public bool sameLine(LocationTextExtractionStrategyEx.ExtendedTextChunk textChunkToCompare)
        {
            return this.m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude && this.m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
        }
    }

    /// <summary>Position of one occurrence, in millimetres, with Y measured from the top of the page.</summary>
    public class SearchResult
    {
        public float iPosX;
        public float iPosY;

        /// <summary>Derives the position from the start of the character's ascent line.</summary>
        /// <param name="aCharcter">Render info of the first character of the occurrence.</param>
        /// <param name="fPageSizeY">Page height in PDF units.</param>
        public SearchResult(TextRenderInfo aCharcter, float fPageSizeY)
        {
            //Get position of upperLeft coordinate
            Vector vTopLeft = aCharcter.GetAscentLine().GetStartPoint();
            float fPosX = vTopLeft[Vector.I1];
            float fPosY = vTopLeft[Vector.I2];
            //Transform to mm and get y from top of page
            // NOTE(review): the fields are float but the values are still rounded to whole
            // millimetres by Convert.ToInt32 - confirm whether the rounding is intended.
            iPosX = Convert.ToInt32(fPosX * PDF_PX_TO_MM);
            iPosY = Convert.ToInt32((fPageSizeY - fPosY) * PDF_PX_TO_MM);
        }
    }

    /// <summary>Text of one line together with the render info of each of its characters.</summary>
    public class LineInfo
    {
        public string m_Text;
        public List<TextRenderInfo> m_LineCharsList;

        /// <summary>Starts a line from its first chunk.</summary>
        public LineInfo(LocationTextExtractionStrategyEx.ExtendedTextChunk initialTextChunk)
        {
            this.m_Text = initialTextChunk.m_text;
            // Copy the list: appendText mutates it, and the chunk's own list must stay untouched.
            this.m_LineCharsList = new List<TextRenderInfo>(initialTextChunk.m_ChunkChars);
        }

        /// <summary>Appends the text and the character infos of a further chunk on the same line.</summary>
        public void appendText(LocationTextExtractionStrategyEx.ExtendedTextChunk additionalTextChunk)
        {
            m_LineCharsList.AddRange(additionalTextChunk.m_ChunkChars);
            this.m_Text += additionalTextChunk.m_text;
        }
    }
}

0

Источник

Quoc Anh Nguyen 20 июн '23 в 09:40

Вот как вы используете LocationTextExtractionStrategy в VB.NET.

Определение класса:

      ''' <summary>
      ''' Extraction strategy that, besides the base-class handling, remembers the bounding
      ''' rectangle and the text of every rendered chunk so that a text can be located later.
      ''' </summary>
      Class TextExtractor
    Inherits LocationTextExtractionStrategy
    Implements iTextSharp.text.pdf.parser.ITextExtractionStrategy

    ''' <summary>Rectangle and text of every chunk seen so far, in render order.</summary>
    Public oPoints As IList(Of RectAndText) = New List(Of RectAndText)

    ''' <summary>Passes the chunk to the base class and records its rectangle and text.</summary>
    Public Overrides Sub RenderText(renderInfo As TextRenderInfo)
        MyBase.RenderText(renderInfo)

        ' Lower-left corner: start of the descent line; upper-right corner: end of the ascent line.
        Dim lowerLeft As Vector = renderInfo.GetDescentLine().GetStartPoint()
        Dim upperRight As Vector = renderInfo.GetAscentLine().GetEndPoint()

        Dim chunkRect As Rectangle = New Rectangle(lowerLeft(Vector.I1), lowerLeft(Vector.I2), upperRight(Vector.I1), upperRight(Vector.I2))
        oPoints.Add(New RectAndText(chunkRect, renderInfo.GetText()))
    End Sub

    ''' <summary>Groups the recorded chunks by the Bottom value of their rectangle.</summary>
    ''' <returns>Dictionary from Bottom value to the chunks sharing it, in recorded order.</returns>
    Private Function GetLines() As Dictionary(Of Single, ArrayList)
        Dim linesByBottom As New Dictionary(Of Single, ArrayList)

        For Each chunk As RectAndText In oPoints
            Dim bottom As Single = chunk.Rect.Bottom
            Dim bucket As ArrayList = Nothing

            ' First chunk with this Bottom value: open a new bucket for it.
            If Not linesByBottom.TryGetValue(bottom, bucket) Then
                bucket = New ArrayList()
                linesByBottom(bottom) = bucket
            End If

            bucket.Add(chunk)
        Next

        Return linesByBottom
    End Function

    ''' <summary>
    ''' Concatenates the chunk texts of each line and returns the rectangle of the chunk at
    ''' which the accumulated text first contains <paramref name="sFind"/>.
    ''' </summary>
    ''' <param name="sFind">Text to look for.</param>
    ''' <returns>Rectangle of that chunk, or Nothing when no line contains the text.</returns>
    Public Function Find(ByVal sFind As String) As iTextSharp.text.Rectangle
        For Each lineChunks As ArrayList In GetLines().Values
            Dim accumulated As String = ""

            For Each chunk As RectAndText In lineChunks
                accumulated &= chunk.Text
                If accumulated.IndexOf(sFind) >= 0 Then
                    Return chunk.Rect
                End If
            Next
        Next

        Return Nothing
    End Function

End Class

''' <summary>Pairs a chunk of text with the rectangle it occupies.</summary>
Public Class RectAndText
    ''' <summary>Rectangle of the chunk.</summary>
    Public Rect As iTextSharp.text.Rectangle
    ''' <summary>Text of the chunk.</summary>
    Public Text As String

    ''' <summary>Stores the given rectangle and text.</summary>
    Public Sub New(rect As iTextSharp.text.Rectangle, text As String)
        Me.Text = text
        Me.Rect = rect
    End Sub
End Class

Использование (Вставьте поле для подписи справа от найденного текста)

      ''' <summary>
      ''' Copies every page of the input PDF into a new PDF and, on each page where the text
      ''' from txtFind is found, adds an empty signature field to the right of that text.
      ''' </summary>
      ''' <param name="sInFilePath">Path of the PDF to read.</param>
      ''' <param name="sOutFilePath">Path of the PDF to create (overwritten if it exists).</param>
      ''' <remarks>
      ''' Despite the name no encryption is applied. Margins and field size are read from
      ''' the txtBottomMargin, txtLeftMargin, txtWidth and txtHeight controls.
      ''' </remarks>
      Sub EncryptPdf(ByVal sInFilePath As String, ByVal sOutFilePath As String)

        ' Read the settings first so that a bad value fails before any file is created.
        ' CInt performs the same conversion the implicit String-to-Integer assignment did.
        Dim iBottomMargin As Integer = CInt(txtBottomMargin.Text) '10
        Dim iLeftMargin As Integer = CInt(txtLeftMargin.Text) '500
        Dim iWidth As Integer = CInt(txtWidth.Text) '120
        Dim iHeight As Integer = CInt(txtHeight.Text) '780

        Dim oPdfReader As New iTextSharp.text.pdf.PdfReader(sInFilePath)
        Try
            ' Using guarantees the output stream is released even when an exception is thrown.
            Using oOutStream As New FileStream(sOutFilePath, FileMode.Create)
                Dim oPdfDoc As New iTextSharp.text.Document()
                Dim oPdfWriter As PdfWriter = PdfWriter.GetInstance(oPdfDoc, oOutStream)
                'oPdfWriter.SetEncryption(PdfWriter.STRENGTH40BITS, sPassword, sPassword, PdfWriter.AllowCopy)
                oPdfDoc.Open()

                oPdfDoc.SetPageSize(iTextSharp.text.PageSize.LEDGER.Rotate())

                Dim oDirectContent As iTextSharp.text.pdf.PdfContentByte = oPdfWriter.DirectContent
                Dim iNumberOfPages As Integer = oPdfReader.NumberOfPages

                For iPage As Integer = 1 To iNumberOfPages
                    ' Give the new page the size (including rotation) of the source page.
                    oPdfDoc.SetPageSize(oPdfReader.GetPageSizeWithRotation(iPage))
                    oPdfDoc.NewPage()

                    Dim oPdfImportedPage As iTextSharp.text.pdf.PdfImportedPage =
                        oPdfWriter.GetImportedPage(oPdfReader, iPage)
                    Dim iRotation As Integer = oPdfReader.GetPageRotation(iPage)
                    If (iRotation = 90) Or (iRotation = 270) Then
                        ' Rotated source page: place the template with a rotating matrix.
                        oDirectContent.AddTemplate(oPdfImportedPage, 0, -1.0F, 1.0F,
                         0, 0, oPdfReader.GetPageSizeWithRotation(iPage).Height)
                    Else
                        oDirectContent.AddTemplate(oPdfImportedPage, 1.0F, 0, 0, 1.0F, 0, 0)
                    End If

                    ' Running the extractor over the page fills its list of chunk rectangles.
                    Dim oTextExtractor As New TextExtractor()
                    PdfTextExtractor.GetTextFromPage(oPdfReader, iPage, oTextExtractor)

                    Dim oRect As iTextSharp.text.Rectangle = oTextExtractor.Find(txtFind.Text)
                    If oRect IsNot Nothing Then
                        Dim iX As Integer = CInt(oRect.Left + oRect.Width + iLeftMargin) 'Move right
                        Dim iY As Integer = CInt(oRect.Bottom - iBottomMargin) 'Move down

                        Dim field As PdfFormField = PdfFormField.CreateSignature(oPdfWriter)
                        field.SetWidget(New Rectangle(iX, iY, iX + iWidth, iY + iHeight), PdfAnnotation.HIGHLIGHT_OUTLINE)
                        field.FieldName = "myEmptySignatureField" & iPage
                        oPdfWriter.AddAnnotation(field)
                    End If
                Next

                ' Close the document while the reader is still open.
                oPdfDoc.Close()
            End Using
        Finally
            ' Always release the input file, also when copying failed.
            oPdfReader.Close()
        End Try

      End Sub

0

Источник

user1781849 20 мар '21 в 06:26

Другие вопросы по тегам c# itextsharp

user231316 28 май '14 в 15:11 2014-05-28 15:11 · Accepted Answer · 2014-05-28 15:11

Вот очень, очень простая версия реализации.

Перед внедрением очень важно знать, что в PDF-файлах отсутствуют понятия «слово», «абзац», «предложение» и т. д. Кроме того, текст в PDF-файле не обязательно размещается слева направо и сверху вниз, и это никак не связано с языками, в которых пишут не слева направо. Фраза "Hello World" может быть записана в PDF как:

Draw H at (10, 10)
Draw ell at (20, 10)
Draw rld at (90, 10)
Draw o Wo at (50, 20)

Это также может быть написано как

Draw Hello World at (10,10)

Интерфейс ITextExtractionStrategy, который вам нужно реализовать, имеет метод RenderText, который вызывается один раз для каждого фрагмента текста в PDF. Обратите внимание: я сказал «фрагмент», а не «слово». В первом примере выше метод будет вызван четыре раза для этих двух слов. Во втором примере он будет вызван один раз для этих двух слов. Это очень важно понять. В PDF-файлах нет слов, и поэтому в iTextSharp тоже нет слов. Что считать «словом», на 100% решаете вы сами.

Кроме того, как я сказал выше, в PDF-файлах нет абзацев. Причина в том, что PDF-файлы не умеют переносить текст на новую строку. Каждый раз, когда вы видите что-то похожее на разрыв абзаца, на самом деле вы видите новую команду рисования текста, у которой координата y отличается от координаты предыдущей строки. Смотрите это для дальнейшего обсуждения.

Код ниже — очень простая реализация. В ней я создаю подкласс LocationTextExtractionStrategy, который уже реализует ITextExtractionStrategy. При каждом вызове RenderText() я нахожу прямоугольник текущего фрагмента (используя код Марка) и сохраняю его на будущее. Для хранения этих фрагментов и прямоугольников я использую простой вспомогательный класс:

/// <summary>Helper that pairs a chunk of text with the rectangle it occupies.</summary>
public class RectAndText {
    /// <summary>Rectangle of the chunk.</summary>
    public iTextSharp.text.Rectangle Rect;

    /// <summary>Text of the chunk.</summary>
    public string Text;

    /// <summary>Stores the given rectangle and text.</summary>
    public RectAndText(iTextSharp.text.Rectangle rect, string text) {
        Text = text;
        Rect = rect;
    }
}

А вот и подкласс:

/// <summary>
/// Extraction strategy that, besides the base-class handling, stores the bounding
/// rectangle and the text of every rendered chunk.
/// </summary>
public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy {
    /// <summary>Rectangle and text of every chunk seen so far, in render order.</summary>
    public List<RectAndText> myPoints = new List<RectAndText>();

    /// <summary>Called for each chunk of text; records its rectangle and text.</summary>
    public override void RenderText(TextRenderInfo renderInfo) {
        base.RenderText(renderInfo);

        // Lower-left corner: start of the descent line; upper-right corner: end of the ascent line.
        Vector lowerLeft = renderInfo.GetDescentLine().GetStartPoint();
        Vector upperRight = renderInfo.GetAscentLine().GetEndPoint();

        myPoints.Add(new RectAndText(
            new iTextSharp.text.Rectangle(
                lowerLeft[Vector.I1],
                lowerLeft[Vector.I2],
                upperRight[Vector.I1],
                upperRight[Vector.I2]),
            renderInfo.GetText()));
    }
}

И, наконец, реализация вышеперечисленного:

//Sample PDF on the current user's desktop
string desktopFolder = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
var testFile = Path.Combine(desktopFolder, "test.pdf");

//Write a one-paragraph PDF to that path
using (var output = new FileStream(testFile, FileMode.Create, FileAccess.Write, FileShare.None))
using (var document = new Document())
using (var pdfWriter = PdfWriter.GetInstance(document, output)) {
    document.Open();
    document.Add(new Paragraph("This is my sample file"));
    document.Close();
}

//The strategy collects a rectangle for every chunk it is shown
var t = new MyLocationTextExtractionStrategy();

//Run the strategy over page 1; the extracted text itself is not needed
using (var pdf = new PdfReader(testFile)) {
    PdfTextExtractor.GetTextFromPage(pdf, 1, t);
}

//Print the text and the lower-left corner of every chunk found
foreach (var chunk in t.myPoints) {
    Console.WriteLine("Found text {0} at {1}x{2}", chunk.Text, chunk.Rect.Left, chunk.Rect.Bottom);
}

Я не могу не подчеркнуть, что приведённый выше код не принимает во внимание «слова» — это уже ваша задача. У объекта TextRenderInfo, который передаётся в RenderText, есть метод GetCharacterRenderInfos(), с помощью которого можно получить больше информации. Вы также можете использовать GetBaseline() вместо GetDescentLine(), если вас не интересуют нижние выносные элементы шрифта.

РЕДАКТИРОВАТЬ

(У меня был отличный обед, поэтому я чувствую себя немного более полезным.)

Вот обновлённая версия MyLocationTextExtractionStrategy, которая делает то, о чём говорится в моих комментариях ниже, а именно: принимает строку для поиска и ищет её в каждом фрагменте. По всем перечисленным причинам это не будет работать в некоторых / многих / большинстве / всех случаях. Если подстрока встречается в одном фрагменте несколько раз, будет возвращено только первое вхождение. Лигатуры и диакритические знаки также могут этому помешать.

/// <summary>
/// Extraction strategy that looks for a search string inside each rendered chunk and
/// stores the rectangle of the first occurrence found in every chunk.
/// </summary>
/// <remarks>
/// A match that is split across several chunks is not found, and only the first
/// occurrence within a chunk is reported.
/// </remarks>
public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy {
    /// <summary>One entry per chunk that contained the search string.</summary>
    public List<RectAndText> myPoints = new List<RectAndText>();

    /// <summary>The string that is searched for; null or empty disables the search.</summary>
    public String TextToSearchFor { get; set; }

    /// <summary>How the chunk text is compared with the search string.</summary>
    public System.Globalization.CompareOptions CompareOptions { get; set; }

    /// <summary>Creates the strategy for the given search string and comparison options.</summary>
    /// <param name="textToSearchFor">The string to locate.</param>
    /// <param name="compareOptions">Comparison options; defaults to None.</param>
    public MyLocationTextExtractionStrategy(String textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None) {
        this.TextToSearchFor = textToSearchFor;
        this.CompareOptions = compareOptions;
    }

    /// <summary>Called for each chunk of text; records the rectangle of a match, if any.</summary>
    public override void RenderText(TextRenderInfo renderInfo) {
        base.RenderText(renderInfo);

        //Nothing to look for: an empty needle would "match" at index 0 with no characters to measure,
        //and a null needle would make the comparison throw
        if (String.IsNullOrEmpty(this.TextToSearchFor)) {
            return;
        }

        var chunkText = renderInfo.GetText();
        if (String.IsNullOrEmpty(chunkText)) {
            return;
        }

        //See if the current chunk contains the text
        var startPosition = System.Globalization.CultureInfo.CurrentCulture.CompareInfo.IndexOf(chunkText, this.TextToSearchFor, this.CompareOptions);

        //If not found bail
        if (startPosition < 0) {
            return;
        }

        //Grab the individual characters of the match
        var chars = renderInfo.GetCharacterRenderInfos().Skip(startPosition).Take(this.TextToSearchFor.Length).ToList();

        //The character list can be shorter than the text; without characters there is nothing to measure
        if (chars.Count == 0) {
            return;
        }

        //Grab the first and last character
        var firstChar = chars[0];
        var lastChar = chars[chars.Count - 1];

        //Bounding box: from the descent line of the first character to the ascent line of the last
        var bottomLeft = firstChar.GetDescentLine().GetStartPoint();
        var topRight = lastChar.GetAscentLine().GetEndPoint();

        //Create a rectangle from it
        var rect = new iTextSharp.text.Rectangle(
                                                bottomLeft[Vector.I1],
                                                bottomLeft[Vector.I2],
                                                topRight[Vector.I1],
                                                topRight[Vector.I2]
                                                );

        //Add this to our main collection
        this.myPoints.Add(new RectAndText(rect, this.TextToSearchFor));
    }
}

Вы бы использовали это так же, как и раньше, но теперь конструктор имеет единственный обязательный параметр:

//Search for "sample"; the second constructor argument (compare options) is left at its default
var t = new MyLocationTextExtractionStrategy("sample");