Извлечение текста из тега <strong> с помощью ElementTree

Я пытаюсь запустить следующий код, чтобы извлечь весь текст из файла XML:

Пожалуйста, обратите внимание на "word_1_14": word.text у этого слова оказывается пустым (NoneType), поэтому текст не выводится... Я обнаружил, что причина в том, что текст заключён в тег <strong> и из-за этого «не виден». Вы знаете, как найти слово внутри тега <strong> и вывести его?

Проблема в этой строке: похоже, что текст слова word_1_14 имеет тип NoneType, из-за чего его невозможно вывести.

В коде Python:
      for word in ocr_word:
В XML-файле:
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>

Мне кажется, что ET.find не может обработать текст XML, заключённый в тег <strong>.

Код Python:

def marktag(xmlObject,draw_img,color,printText,printTag,strongWord=None):
    """Draw the text of an hOCR element on the image and return the element.

    Args:
        xmlObject: element whose ``text`` is drawn when no strong text is given.
        draw_img: drawing object; its ``text()`` method receives the string.
        color: fill value forwarded to ``draw_img.text()``.
        printText: when true, the chosen text is printed and drawn.
        printTag: accepted for the callers' sake; not used in this excerpt.
        strongWord: optional ``<strong>`` child element, or its text as a
            plain string. When it carries non-empty text, that text is used
            instead of ``xmlObject.text``.

    Returns:
        ``xmlObject``, unchanged.
    """
    if printText:
        # Compare against None instead of relying on truthiness: an Element
        # without children is falsy, so ``if strongWord:`` silently skipped
        # every <strong>I</strong>. getattr() also lets callers pass the text
        # itself (a str has no ``.text`` attribute).
        strong_text = None
        if strongWord is not None:
            strong_text = getattr(strongWord, 'text', strongWord)

        if strong_text:
            textInTag = strong_text
            label = 'debug strong '
        else:
            textInTag = xmlObject.text
            label = 'debug 1:'

        if textInTag:
            #debug
            print(label+textInTag)
            # NOTE(review): bbCoord_x0, bbCoord_y0 and fnt are not defined in
            # this excerpt; presumably they are set elsewhere - confirm.
            # Draw through the draw_img argument rather than a global ``draw``.
            draw_img.text((bbCoord_x0,bbCoord_y0),textInTag,font = fnt, fill = color)

    return xmlObject

    # NOTE(review): the two lines below are unreachable because they follow
    # the return statement; they look like module-level setup that was
    # indented by mistake - confirm before moving them.
    #processing the image and show it
    os.chdir('/home/DocData/PDF_DOC/')



# Walk the hOCR tree (area -> paragraph -> line -> word) and hand every
# element to marktag() so it can be marked on the page image.
file = '2001ABI-7.png'
XMLfilename = file+'.hocr'
# NOTE(review): the .hocr file is opened relative to the current directory,
# while the image below is opened from an absolute path - confirm both point
# to the same folder.
tree = ET.parse(XMLfilename) #2550x3300 pixels
root = tree.getroot()
ocr_carea = root.findall(".//{http://www.w3.org/1999/xhtml}div[@class='ocr_carea']")
img = Image.open('/home/bnpp/DocData/PDF_DOC/'+file)
draw = ImageDraw.Draw(img)

area_color = 255
para_color = 145
line_color = 90
word_color = 40
for area in ocr_carea:
    marktag(area,draw,area_color,False,True)

    ocr_para = area.findall(".//{http://www.w3.org/1999/xhtml}p[@class='ocr_par']")
    for para in ocr_para:
        marktag(para,draw,para_color,False,True)
        #some word shown under line
        ocr_line = para.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocr_line']")
        for line in ocr_line:
            marktag(line,draw,line_color,False,True)
            ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")
            for word in ocr_word:
                if len(word) == 0:
                    # Plain word: the text sits directly in the span.
                    marktag(word,draw,word_color,True,False)
                    # Go on with the next word; the former ``break`` dropped
                    # every remaining word of the line.
                    continue

                # The word wraps its text in child markup such as <strong>,
                # so word.text holds only the whitespace before the child.
                # marktag() draws xmlObject.text, therefore give it a detached
                # copy of the span (same tag and attributes) whose text is the
                # flattened text of the word. The parsed tree stays untouched,
                # and the word is drawn exactly like a plain one.
                flat_word = ET.Element(word.tag, dict(word.attrib))
                flat_word.text = "".join(word.itertext()).strip()
                marktag(flat_word,draw,word_color,True,False)

Это XML:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
 xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
  <meta name='ocr-system' content='tesseract 3.03' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
   <div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
    <p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
     <span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
      <span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
      <span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
     </span>
    </p>
   </div>
   <div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
    <p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
     <span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
      <span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
      <span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
     </span>
     <span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
      <span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
      <span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
     </span>
     <span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
      <span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
     </span>
    </p>
   </div>
   <div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
    <p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
     <span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
      <span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
     </span>
     <span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
      <span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
     </span>
     <span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
      <span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
       <strong>T</strong>
      </span>
     </span>
     <span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
      <span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
       <strong>I</strong>
      </span>
     </span>
     <span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
      <span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
     </span>
     <span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
      <span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
     </span>
     <span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
      <span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
       <strong>I</strong>
      </span>
      <span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
       <strong>I</strong>
      </span>
      <span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
      <span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
     </span>
            
 </body>
</html>

Вывод:

    bbox 762 112 1394 161
ocr_carea-block_1_1
bbox 762 112 1394 161
ocr_par-par_1_1
bbox 762 112 1394 161; baseline 0 -1
ocr_line-line_1_1
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 1056 112 1394 161; x_wconf 91
debug 1:LOCATION
bbox 1192 182 1818 318
ocr_carea-block_1_2
bbox 1203 205 1611 307
ocr_par-par_1_2
bbox 1373 205 1611 221; baseline 0 -1
ocr_line-line_1_2
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
bbox 1218 264 1581 281; baseline 0.006 -2
ocr_line-line_1_3
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
bbox 1203 292 1276 307; baseline 0 0
ocr_line-line_1_4
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
para_word
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
para_word
bbox 1536 205 1611 221; x_wconf 80
debug 1:ABOVE
para_word
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
para_word
bbox 1493 265 1581 281; x_wconf 85
debug 1:GROUND
para_word
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
bbox 131 211 1057 1378
ocr_carea-block_1_3
bbox 131 211 1057 1378
ocr_par-par_1_3
bbox 1012 211 1028 229; baseline 0 0
ocr_line-line_1_5
bbox 1012 211 1028 229; x_wconf 92
debug 1:L
bbox 1011 236 1027 254; baseline 0 0
ocr_line-line_1_6
bbox 1011 236 1027 254; x_wconf 88
debug 1:A
bbox 1013 261 1027 279; baseline 0 0
ocr_line-line_1_7
bbox 1013 261 1027 279; x_wconf 97
ocrx_word-word_1_10
bbox 1012 286 1020 304; baseline 0 0
ocr_line-line_1_8
bbox 1012 286 1020 304; x_wconf 97
ocrx_word-word_1_11
bbox 1013 311 1027 329; baseline 0 0
ocr_line-line_1_9
bbox 1013 311 1027 329; x_wconf 97
debug 1:T
bbox 1012 335 1027 354; baseline 0 0
ocr_line-line_1_10
bbox 1012 335 1027 354; x_wconf 92
debug 1:U
bbox 621 360 1030 387; baseline 0.002 -7
ocr_line-line_1_11
bbox 621 383 624 387; x_wconf 50
ocrx_word-word_1_14
bbox 761 383 764 387; x_wconf 50
ocrx_word-word_1_15
bbox 849 362 922 381; x_wconf 68
debug 1:Afifine

1 ответ

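Вы можете получить дочерние элементы с помощью getchildren(), а затем взять text у каждого из них (в Python 3.9 и новее метод getchildren() удалён — вместо него используйте list(word) или перебор for child in word).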

for word in ocr_word:

    # itertext() yields the word's own text, the text of every nested
    # element (e.g. <strong>) and any tail text, in document order.
    # It replaces Element.getchildren(), which was removed in Python 3.9,
    # and never yields None, so a missing text cannot break .strip().
    pieces = (piece.strip() for piece in word.itertext())

    # drop whitespace-only pieces and create one string
    text = " ".join(piece for piece in pieces if piece)

    # result
    print(word.get('id'), text)

Минимальный рабочий пример

from xml.etree import ElementTree as ET

data = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.03' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
<strong>T</strong>
</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
</span>
</p>
</div>
</div>
</body>
</html>'''


def word_text(word):
    """Return the visible text of an element, including nested markup.

    The element's own text, the text of every descendant (for example a
    ``<strong>`` child) and any tail text are stripped, empty pieces are
    dropped, and the rest is joined with single spaces.
    """
    # itertext() replaces Element.getchildren(), which was removed in
    # Python 3.9; it never yields None, so .strip() cannot fail on an
    # element that has no text of its own.
    pieces = (piece.strip() for piece in word.itertext())
    return " ".join(piece for piece in pieces if piece)


# ET.fromstring() returns the root element itself, so getroot() is not needed.
tree = ET.fromstring(data)

line = tree

ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")

for word in ocr_word:
    text = word_text(word)
    print(word.get('id'), text)
Другие вопросы по тегам