Извлечение текста из тега <strong> с помощью ElementTree
Я пытаюсь запустить следующий код, чтобы извлечь весь текст из файла XML:
Пожалуйста, обратите внимание на "word_1_14": word.text для него имеет тип None, поэтому текст не печатается... Я обнаружил, что причина в том, что текст обёрнут в тег <strong> и поэтому недоступен через word.text. Знаете ли вы, как найти слово внутри тега <strong> и распечатать его?
Проблема в этой строке: похоже, что для word_1_14 значение word.text имеет тип None, из-за чего распечатать текст невозможно.
В коде Python:
for word in ocr_word:
В XML-файле:
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
Мне кажется, что ET.find не может обработать текст XML, заключённый в тег <strong>.
Код Python:
##marktag: print the text on top of the image:
#whether it is the area name, or the characters itself
def marktag(xmlObject, draw_img, color, printText, printTag, strongWord=None):
    """Draw the text of an hOCR element onto the image and return the element.

    Args:
        xmlObject: ElementTree element being processed.
        draw_img: drawing object exposing ``text(xy, text, font=, fill=)``
            (an ``ImageDraw.Draw`` instance at the visible call sites).
        color: fill value forwarded to ``draw_img.text``.
        printText: when false, nothing is printed or drawn.
        printTag: accepted for call-site compatibility; the visible part of
            this function does not use it.
        strongWord: optional child element (e.g. the ``<strong>`` node), or
            a plain string, whose text replaces ``xmlObject.text``.

    Returns:
        ``xmlObject``, unchanged.
    """
    if not printText:
        return xmlObject

    # Compare against None explicitly: an Element without children is falsy,
    # so ``if strongWord:`` would skip every <strong>I</strong> node.
    if strongWord is not None:
        if isinstance(strongWord, str):
            textInTag = strongWord
        else:
            textInTag = strongWord.text
        label = 'debug strong '
    else:
        textInTag = xmlObject.text
        label = 'debug 1:'

    # A span that only wraps a child has text of None or bare whitespace;
    # there is nothing useful to draw in that case.
    textInTag = (textInTag or '').strip()
    if not textInTag:
        return xmlObject

    #debug
    print(label + textInTag)
    # NOTE(review): bbCoord_x0, bbCoord_y0 and fnt are not defined in the
    # visible snippet - presumably set elsewhere in the full script; confirm.
    # Draw through the parameter rather than the module-level ``draw``.
    draw_img.text((bbCoord_x0, bbCoord_y0), textInTag, font=fnt, fill=color)
    return xmlObject
#processing the image and show it
# NOTE(review): the working directory below and the image path passed to
# Image.open() point at different roots ('/home/DocData' vs
# '/home/bnpp/DocData'); confirm which one is intended.
os.chdir('/home/DocData/PDF_DOC/')
file = '2001ABI-7.png'
XMLfilename = file+'.hocr'
tree = ET.parse(XMLfilename) #2550x3300 pixels
root = tree.getroot()
ocr_carea = root.findall(".//{http://www.w3.org/1999/xhtml}div[@class='ocr_carea']")
img = Image.open('/home/bnpp/DocData/PDF_DOC/'+file)
draw = ImageDraw.Draw(img)

# Fill values used for each level of the hOCR hierarchy.
area_color = 255
para_color = 145
line_color = 90
word_color = 40

# Walk the hierarchy: content area -> paragraph -> line -> word.
for area in ocr_carea:
    marktag(area, draw, area_color, False, True)
    ocr_para = area.findall(".//{http://www.w3.org/1999/xhtml}p[@class='ocr_par']")
    for para in ocr_para:
        marktag(para, draw, para_color, False, True)
        #some word shown under line
        ocr_line = para.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocr_line']")
        for line in ocr_line:
            marktag(line, draw, line_color, False, True)
            ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")
            for word in ocr_word:
                if len(word):
                    # The word's text sits in its first child (e.g. <strong>),
                    # so hand that element over and ask for the text to be
                    # printed; previously printText was False here and the
                    # strong text could never appear.
                    marktag(word, draw, word_color, True, True, word[0])
                else:
                    # Plain word: text is directly on the span. Keep looping -
                    # a ``break`` here dropped every later word on the line.
                    marktag(word, draw, word_color, True, False)
Это XML:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.03' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
<strong>T</strong>
</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
</span>
</p>
</div>
</div>
</body>
</html>
Вывод:
bbox 762 112 1394 161
ocr_carea-block_1_1
bbox 762 112 1394 161
ocr_par-par_1_1
bbox 762 112 1394 161; baseline 0 -1
ocr_line-line_1_1
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 1056 112 1394 161; x_wconf 91
debug 1:LOCATION
bbox 1192 182 1818 318
ocr_carea-block_1_2
bbox 1203 205 1611 307
ocr_par-par_1_2
bbox 1373 205 1611 221; baseline 0 -1
ocr_line-line_1_2
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
bbox 1218 264 1581 281; baseline 0.006 -2
ocr_line-line_1_3
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
bbox 1203 292 1276 307; baseline 0 0
ocr_line-line_1_4
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
para_word
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
para_word
bbox 1536 205 1611 221; x_wconf 80
debug 1:ABOVE
para_word
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
para_word
bbox 1493 265 1581 281; x_wconf 85
debug 1:GROUND
para_word
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
bbox 131 211 1057 1378
ocr_carea-block_1_3
bbox 131 211 1057 1378
ocr_par-par_1_3
bbox 1012 211 1028 229; baseline 0 0
ocr_line-line_1_5
bbox 1012 211 1028 229; x_wconf 92
debug 1:L
bbox 1011 236 1027 254; baseline 0 0
ocr_line-line_1_6
bbox 1011 236 1027 254; x_wconf 88
debug 1:A
bbox 1013 261 1027 279; baseline 0 0
ocr_line-line_1_7
bbox 1013 261 1027 279; x_wconf 97
ocrx_word-word_1_10
bbox 1012 286 1020 304; baseline 0 0
ocr_line-line_1_8
bbox 1012 286 1020 304; x_wconf 97
ocrx_word-word_1_11
bbox 1013 311 1027 329; baseline 0 0
ocr_line-line_1_9
bbox 1013 311 1027 329; x_wconf 97
debug 1:T
bbox 1012 335 1027 354; baseline 0 0
ocr_line-line_1_10
bbox 1012 335 1027 354; x_wconf 92
debug 1:U
bbox 621 360 1030 387; baseline 0.002 -7
ocr_line-line_1_11
bbox 621 383 624 387; x_wconf 50
ocrx_word-word_1_14
bbox 761 383 764 387; x_wconf 50
ocrx_word-word_1_15
bbox 849 362 922 381; x_wconf 68
debug 1:Afifine
1 ответ
Вы можете получить дочерние элементы (через `getchildren()` или просто перебирая сам элемент),
а затем взять `text`
у каждого из этих дочерних элементов.
for word in ocr_word:
    # itertext() yields the element's own text and then the text of every
    # descendant (such as <strong>) in document order. It never yields None,
    # and it avoids Element.getchildren(), which was removed in Python 3.9.
    pieces = (chunk.strip() for chunk in word.itertext())
    # drop whitespace-only chunks and create one string
    text = " ".join(piece for piece in pieces if piece)
    # result
    print(word.get('id'), text)
Минимальный рабочий пример
data = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.03' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
<strong>T</strong>
</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
</span>
</p>
</div>
</div>
</body>
</html>'''
from xml.etree import ElementTree as ET

# fromstring() returns the root element directly, so no getroot() call is
# needed; the names ``tree`` and ``line`` are kept from the original example.
tree = ET.fromstring(data)
line = tree
ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")
for word in ocr_word:
    # itertext() walks the word's own text plus the text of nested elements
    # such as <strong>. It never yields None and replaces
    # Element.getchildren(), which was removed in Python 3.9.
    pieces = (chunk.strip() for chunk in word.itertext())
    text = " ".join(piece for piece in pieces if piece)
    print(word.get('id'), text)