Web Scraping для извлечения опций из MCQ
Я пытаюсь выполнить скрейпинг страницы http://www.geekmcq.com/verbal-ability/analogies/, чтобы извлечь варианты ответов из MCQ. Я делаю это, чтобы распечатать вопросы, так как на самом сайте они не приспособлены для печати. Вопросы и ответы я уже извлёк с помощью nodejs и cheerio — библиотеки в стиле jQuery,
но проблема в извлечении вариантов ответа. Как я могу извлечь их с помощью jQuery? Этот же синтаксис я затем буду использовать в nodejs.
Вот скриншот одного MCQ
HTML для одного mcq, как показано на скриншоте выше
<tbody>
<tr>
<td class="bix-td-qno" rowspan="2" align="left" valign="top">1. </td>
<td class="bix-td-qtxt" valign="top">BIRD : NEST</td>
</tr>
<tr>
<td class="bix-td-miscell" valign="top">
<table class="bix-tbl-options" id="tblOption_3530" border="0" cellpadding="0" cellspacing="0" width="100%">
<tbody>
<tr>
<td class="bix-td-option" id="tdOptionNo_A_3530" width="1%"><a id="lnkOptionLink_A_3530" href="javascript: void 0;">A.</a></td>
<td class="bix-td-option" id="tdOptionDt_A_3530" width="49%">horse : farm</td>
<td class="bix-td-option" id="tdOptionNo_B_3530" width="1%"><a id="lnkOptionLink_B_3530" href="javascript: void 0;">B.</a></td>
<td class="bix-td-option" id="tdOptionDt_B_3530" width="49%">squirrel : tree</td>
</tr>
<tr>
<td class="bix-td-option" id="tdOptionNo_C_3530" width="1%"><a id="lnkOptionLink_C_3530" href="javascript: void 0;">C.</a></td>
<td class="bix-td-option" id="tdOptionDt_C_3530" width="49%">beaver : dam</td>
<td class="bix-td-option" id="tdOptionNo_D_3530" width="1%"><a id="lnkOptionLink_D_3530" href="javascript: void 0;">D.</a></td>
<td class="bix-td-option" id="tdOptionDt_D_3530" width="49%">cat : kitchen</td>
</tr>
<tr>
<td class="bix-td-option" id="tdOptionNo_E_3530" width="1%"><a id="lnkOptionLink_E_3530" href="javascript: void 0;">E.</a></td>
<td class="bix-td-option" id="tdOptionDt_E_3530" width="49%">book : library</td>
<td class="bix-td-option" id="tdOptionNo_D_3530" width="1%"><strong class="ib-gray"> </strong> <a id="lnkOptionLink_D_3530" href="javascript: void 0;"></a> </td>
<td class="bix-td-option" id="tdOptionDt_D_3530" width="49%"> </td>
</tr>
</tbody>
</table>
<input class="jq-hdnakq" id="hdnAnswer_3530" value="C" type="hidden">
<div class="bix-div-answer" style="display:none" id="divAnswer_3530">
<div class="title-bar"><a href="javascript: void 0;" onclick="$('#divAnswer_3530').slideToggle('slow');">Answer & Explanation</a></div>
<div class="div-spacer">
<p><span class="ib-green"><strong>Answer:</strong></span> Option <strong class="jq-hdnakqb">C</strong></p>
<p><span class="ib-green"><strong>Explanation:</strong></span></p>
<p> Bird makes nest as beaver makes dam. </p>
</div>
</div>
<div class="bix-div-workspace" style="display:none" id="divWorkspace_3530">
<div class="title-bar"><a href="javascript: void 0;" onclick="$('#divWorkspace_3530').slideToggle('slow');">Workspace</a></div>
<div class="div-spacer">
<div class="div-calc" align="right">
<input value="Calc" size="15" class="calc-box hasCalculator" type="text"><img class="calculator-trigger" title="Open the calculator" alt="Open the calculator" src="http://www.geekmcq.com/images/calculator.png">
</div>
<textarea rows="10" cols="65" style="width:100%; padding: 10px;"></textarea>
</div>
</div>
<div class="bix-div-report" style="display:none" id="divReport_3530">
<div class="title-bar"><a href="javascript: void 0;" onclick="$('#divReport_3530').slideToggle('slow');">Report Errors</a></div>
<div class="div-spacer">
<textarea onchange="javascript:if(this.name == '')this.name='txtReport_3530';" name="" id="txtReport_3530" rows="8" cols="65" style="width:100%;padding:10px;">Kindly mention the details of the error here...
[Your Name]
[Your Email]</textarea>
<input id="btnReport_3530" onclick="SendBixReport('1', '3530')" value="Send Report" style="margin-top:2px" type="button">
</div>
</div>
<div class="bix-div-toolbar" id="divToolBar_3530">
<a class="answer" href="javascript: void 0;" onclick="$('#divAnswer_3530').slideToggle('slow')">View Answer</a>
<a class="workspace" href="javascript: void 0;" onclick="$('#divWorkspace_3530').slideToggle('slow')">Workspace</a>
<a class="report" href="javascript: void 0;" onclick="$('#divReport_3530').slideToggle('slow')">Report</a>
<a class="discuss" href="http://www.geekmcq.com/verbal-ability/analogies/discussion-3530">Discuss in Forum</a>
</div>
</td>
</tr>
1 ответ
Решение
Возьми этот код, дружище (:
var
cheerio = require('cheerio'),
request = require('request');
/**
 * Downloads a page with `request.get` and hands its body to `callback`.
 *
 * On a request error the error is written to the console and `callback` is
 * invoked with no arguments, so callers receive `undefined` instead of markup.
 *
 * @param {string} url - Address of the page to download.
 * @param {function(string=)} callback - Receives the response body on success.
 */
function getHTML(url, callback) {
    function onResponse(err, response, body) {
        if (!err) {
            callback(body);
            return;
        }

        // Report the failure, then still notify the caller (without a body).
        console.error(err);
        return callback();
    }

    request.get(url, onResponse);
}
// Page that is downloaded and scraped for questions.
var url = 'http://www.geekmcq.com/verbal-ability/analogies/';

// Fetch the page, build one { text, options, answer } record per question
// container and print the resulting array to the console.
getHTML(url, function(html) {
    // getHTML calls back without an argument when the request fails (it has
    // already logged the error), so there is no markup to hand to cheerio.
    if (html == null) {
        return;
    }

    var $ = cheerio.load(html);
    var questions = [];

    // [option letter, row, cell] of the cell holding that option's text.
    // Indices are 1-based because they are fed to :nth-child().
    var optionCells = [
        ['A', 1, 2],
        ['B', 1, 4],
        ['C', 2, 2],
        ['D', 2, 4],
        ['E', 3, 2]
    ];

    // NOTE(review): the '>' selectors below require <tr> to be a direct child
    // of <table> in the parsed document; if the parser keeps or inserts a
    // <tbody> they will match nothing - confirm against the cheerio version in use.
    $('#ib-main-bar .bix-div-container').each(function() {
        var $container = $(this);
        var question = {
            text: $container.find('table:nth-child(1)>tr:nth-child(1)>td:nth-child(2)').html(),
            options: {},
            answer: null
        };

        // Second row of the question table; every option cell and the answer
        // element are looked up inside it.
        var $optionsPart = $container.find('table:nth-child(1)>tr:nth-child(2)');

        for (var i = 0; i < optionCells.length; i++) {
            var letter = optionCells[i][0];
            var cellSelector = 'tr:nth-child(' + optionCells[i][1] + ')>td:nth-child(' + optionCells[i][2] + ')';
            question.options[letter] = $optionsPart.find(cellSelector).html();
        }

        // The correct option letter is the value of the .jq-hdnakq element.
        question.answer = $optionsPart.find('.jq-hdnakq').val();
        questions.push(question);
    });

    console.log(questions);
});
Выводит в консоль массив объектов с полями: text, options, answer
скриншот: http://joxi.ru/a2X77bSy3975mg