itextsharp: слова разбиваются при разбиении textchunk на слова

Question

itextsharp: слова разбиваются при разбиении textchunk на слова

Я хочу выделить несколько ключевых слов в наборе файлов PDF. Во-первых, мы должны определить отдельные слова и сопоставить их с моими ключевыми словами. Я нашел пример:

// Text extraction strategy that, in addition to the base class behaviour,
// records a rectangle and the text for every chunk passed to RenderText.
// NOTE(review): this snippet is cut off inside RenderText; the closing braces
// of the method and the class are not part of the visible code.
class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
    // Collected (rectangle, text) pairs, one entry per rendered text chunk
    public List<RectAndText> myPoints = new List<RectAndText>();

    // Terms supplied by the caller; stored here but not read in the visible part of the class
    List<string> topicTerms;

    // Stores the list of terms handed in by the caller.
    public MyLocationTextExtractionStrategy(List<string> topicTerms)
    {
        this.topicTerms = topicTerms;
    }

    // Automatically called for each chunk of text in the PDF.
    // A chunk is whatever the PDF content stream draws in one operation, so it
    // is not guaranteed to be a whole word (see the question text below).
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // Let the base strategy record the chunk first
        base.RenderText(renderInfo);


        // Get the bounding box for the chunk of text:
        // start of the descent line and end of the ascent line
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        // Create a rectangle from those two corner points
        var rect = new iTextSharp.text.Rectangle(
                                                bottomLeft[Vector.I1],
                                                bottomLeft[Vector.I2],
                                                topRight[Vector.I1],
                                                topRight[Vector.I2]
                                                );

        // Add this to our main collection
        // (a filter for meaningless words was intended here; none is applied in the visible code)
        // NOTE(review): 'text' is not used in the visible code; GetText() is called again below.
        string text = renderInfo.GetText();
        this.myPoints.Add(new RectAndText(rect, renderInfo.GetText()));

Однако я обнаружил, что многие слова оказываются разбитыми на части. Например, "stop" превращается в "st" и "op". Есть ли другой способ определить отдельное слово и его позицию?

0

itextsharp words text-chunking

Источник

user5687780 16 дек '15 в 17:30

1 ответ

Другие вопросы по тегам itextsharp words text-chunking

user5687780 18 дек '15 в 20:10 2015-12-18 20:10 · Answer 1 · 2015-12-18 20:10

Если вы хотите собрать отдельные слова и их координаты, лучше переопределить существующую стратегию LocationTextExtractionStrategy. Вот мой код:

/// <summary>
/// Builds the extracted text from the filtered, sorted text chunks and, as a
/// side effect, groups the chunks into words that are stored in myPoints
/// (via mergeAndStoreChunk).
/// </summary>
/// <param name="chunkFilter">Filter passed on to filterTextChunks to select the chunks to process.</param>
/// <returns>
/// The chunk texts concatenated in sorted order, with a single space inserted
/// at detected word boundaries and a '\n' between chunks on different lines.
/// </returns>
/// <remarks>
/// Every call appends to myPoints; calling this method more than once stores
/// the same words again.
/// </remarks>
public virtual String GetResultantText(ITextChunkFilter chunkFilter){
        if (DUMP_STATE) {
            DumpState();
        }

        List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
        filteredTextChunks.Sort();

        // Pieces of the word currently being assembled.
        List<RectAndText> tmpList = new List<RectAndText>();

        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        foreach (TextChunk chunk in filteredTextChunks) {

            if (lastChunk != null) {
                if (!chunk.SameLine(lastChunk)) {
                    // A line break always ends the current word. Without this flush the
                    // pending pieces would be glued to chunks of the following line.
                    sb.Append('\n');
                    flushPendingWord(tmpList);
                } else if (IsChunkAtWordBoundary(chunk, lastChunk) && !StartsWithSpace(chunk.Text) && !EndsWithSpace(lastChunk.Text)) {
                    // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
                    sb.Append(' ');
                    flushPendingWord(tmpList);
                }
            }

            // Every chunk, including the first chunk of each line, contributes
            // to both the text and the word being assembled.
            sb.Append(chunk.Text);
            tmpList.Add(toRectAndText(chunk));

            lastChunk = chunk;
        }

        // The last word has no following boundary to trigger a flush, so store it here.
        flushPendingWord(tmpList);

        return sb.ToString();
    }

    /// <summary>
    /// Stores the pieces collected so far as one word and empties the list.
    /// Does nothing when no pieces are pending.
    /// </summary>
    private void flushPendingWord(List<RectAndText> pendingPieces)
    {
        if (pendingPieces.Count > 0)
        {
            mergeAndStoreChunk(pendingPieces);
            pendingPieces.Clear();
        }
    }

    /// <summary>
    /// Wraps a chunk's text together with a rectangle spanned by its start and end locations.
    /// </summary>
    private static RectAndText toRectAndText(TextChunk chunk)
    {
        var startLocation = chunk.StartLocation;
        var endLocation = chunk.EndLocation;

        // NOTE(review): the rectangle is built only from the start and end points of the
        // chunk; if both lie on the same baseline it has no height - confirm whether the
        // ascent/descent should be tracked as well.
        var rect = new iTextSharp.text.Rectangle(startLocation[0], startLocation[1], endLocation[0], endLocation[1]);
        return new RectAndText(rect, chunk.Text);
    }

    /// <summary>
    /// Merges the given pieces into a single entry and appends it to myPoints.
    /// The first piece is reused as the merged entry: its text receives the texts
    /// of all following pieces, and its right edge is moved to the right edge of
    /// each following piece in turn (ending at that of the last piece).
    /// </summary>
    /// <param name="tmpList">
    /// Pieces of one word, in reading order. A null or empty list is ignored.
    /// The first element is modified in place.
    /// </param>
    private void mergeAndStoreChunk(List<RectAndText> tmpList)
    {
        // Nothing collected means there is no word to store; indexing tmpList[0] would throw.
        if (tmpList == null || tmpList.Count == 0)
        {
            return;
        }

        RectAndText mergedChunk = tmpList[0];
        for (int i = 1; i < tmpList.Count; i++)
        {
            RectAndText nowChunk = tmpList[i];
            // Only the right edge is extended; left, top and bottom stay those of the first piece.
            mergedChunk.Rect.Right = nowChunk.Rect.Right;
            mergedChunk.Text += nowChunk.Text;
        }
        this.myPoints.Add(mergedChunk);
    }

myPoints — это список, в котором после вызова окажутся все найденные слова вместе с их координатами.