Невозможно получить название продукта со страницы, зашифрованной в javascript
Создавая скребок с целью анализа названия различных продуктов с веб-страницы, когда я запускаю ее, я ничего не получаю. Что я мог видеть, что желаемое содержимое находится внутри элемента javascript. Поскольку я не хочу использовать какой-либо конвертер для сбора этих названий, я ожидаю, что кто-нибудь поможет мне получить это, используя словарь. То, что я попробовал изначально, было так:
Sub RmartData()
Dim http As New XMLHTTP60, html As New HTMLDocument
Dim titelem As Object, title As Object, protitle As Object
With http
.Open "GET", "https://redmart.com/bakery", False
.send
html.body.innerHTML = .responseText
End With
Set titelem = html.getElementsByClassName("description")
For Each title In titelem
Set protitle = title.getElementsByTagName("a")
x = x + 1
Cells(x, 1) = protitle(0).innerText
Next title
End Sub
Осматривая элемент я вижу это:
<div class="description"><!-- react-empty: 288 --><h4 title="Gardenia Hotdog Buns 4's"><a href="/product/gardenia-hotdog-buns-4's-13400">Gardenia Hotdog Buns 4's</a></h4><div><span class="size">220 g</span><a href="#" data-tooltip="Guaranteed fresh for 3+ days, inc. delivery day" class="tag green small tooltip">3D+</a></div><div id="BVRRInlineRating-13400" class="ratings"> <div class="bv-cleanslate bv-cv2-cleanslate"> <div data-bv-v="ratingItem:37" class="bv-shared bv-core-container-134 bv-rating-top-statistic">
<!--[if lt IE 7]> <div class="bv-compat bvie6 bvie-lt8 bvie"> <![endif]--> <!--[if IE 7]> <div class="bv-compat bvie7 bvie-lt8 bvie"> <![endif]--> <!--[if IE 8]> <div class="bv-compat bvie8 bvie"> <![endif]--> <!--[if IE 9]> <div class="bv-compat bvie9 bvie"> <![endif]--> <!--[if gt IE 9]> <!--><div class="bv-compat"> <!--<![endif]--> <div class="bv-inline-rating-container"> <dl class="bv-stars-container" role="presentation"> <dd class="bv-rating-ratio" role="presentation"> <span class="bv-rating-stars-container"> <span class="bv-rating-stars bv-rating-stars-off" aria-hidden="true"> ★★★★★ </span> <span class="bv-rating-stars-on bv-rating-stars" style="width:40% !important;" aria-hidden="true"> ★★★★★ </span> <span class="bv-off-screen">2 out of 5 stars. Read reviews.</span> </span> </dd> <dd class="bv-rating-ratio-count" role="presentation"> <span class="bv-rating-label"> (1) </span> </dd> </dl> </div> </div>
</div> </div></div></div>
В инструментах разработчика Chrome, нажав кнопку ответа, я получаю это:
<!DOCTYPE html><html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" itemscope itemtype="http://schema.org/WebPage"><head><meta charset="utf-8"><title>Online Grocery Shopping - Groceries Delivery Singapore | RedMart</title><meta name="description" content="Singapore's leading online grocery service. Fresh groceries, household essentials and specialty products delivered to your door."><meta name="keywords" content="online groceries, grocery delivery singapore, online grocery shopping, online supermarket"><meta name="p:domain_verify" content="bf4d2b7c4d98534d44b1f2ff3bb02ee7"><meta name="msvalidate.01" content="A8CF80424AEF2B51E2C1313E6EA133BD"/><meta name="google-site-verification" content="AXdSfFmvBak9HqZjP7ihAAN5v2iVUu7BEsbCIcvJgwg"/><meta property="og:type" content="website"><meta property="og:title" content="Online Grocery Shopping - Groceries Delivery Singapore"><meta property="og:description" content="Singapore's leading online grocery service. Fresh groceries, household essentials and specialty products delivered to your door."><meta property="og:url" content="https://redmart.com/"><meta property="og:image" content="https://s3-ap-southeast-1.amazonaws.com/media.redmart.com/assets/logo-FB.png"><meta property="og:site_name" content="RedMart"><meta property="fb:app_id" content="219268451429695"><meta name="git:tag" content="" 2="" 13="" 0=""><meta name="git:rev" content="68f1442"><meta name="git:commit_id" content="68f14427b86c2d64a3733140b5c205a013eb00e2"><meta name="git:date" content="2017-05-11 09:57:13 +0800"><meta name="viewport" content="width=1024"><link rel="icon" type="image/png" href="/img/favicon.ca3eb8fb.png"><meta name="apple-itunes-app" content="app-id=606780396, app-argument={{url}}"><meta name="google-play-app" content="app-id=com.redmart.redmart"><link rel="apple-touch-icon" href="http://media.redmart.com/assets/appicon-152x152.png"><link rel="alternate" href="android-app://com.redmart.redmart"/><meta name="apple-mobile-web-app-capable" content="yes"><meta name="mobile-web-app-capable" content="yes"><link rel="stylesheet" href="/styles/app.afc1f1db.css"><link rel="canonical" href="https://redmart.com/"/><style>.spinner{width:80px;height:80px;margin:0;padding: 0;border:1px solid #ee4054;-webkit-border-radius:50%;border-radius:50%;border-top-color:transparent;border-left-color:transparent;-webkit-animation:spin 600ms infinite linear;-moz-animation:spin 600ms infinite linear;-o-animation:spin 600ms infinite linear;-ms-animation:spin 600ms infinite linear;animation:spin 600ms infinite linear}@-moz-keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}@-webkit-keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}@-o-keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}@keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}</style><body style="background: #ECF0F1"><div id="app" class="firstLoad"><header id="navbar"></header><div id="middle"><aside id="leftSidebar"></aside><section id="layoutContainerWrapper"><section id="stickyNotification"></section><section id="contentBanner"></section><section id="contentTabs"></section><section id="contentWrapper"><section id="contentNavigation"></section><article id="contentSection"></article></section></section><aside id="rightSidebar"></aside></div><footer id="footer"></footer></div><div id="notifications"></div><div id="lightbox" class="hide"></div><div id="modal"></div><div id="tooltips"></div><div id="overlay"></div><div id="offline" class="fullScreen" style="display:none"><div><svg id="logo" viewBox="0 0 1024 237.391"><use xlink:href="#svg-rmlogo"></use></svg><div class="spinner"></div></div></div><script type="text/javascript" src="//display.ugc.bazaarvoice.com/static/redmart/en_SG/bvapi.js"></script><script type="text/javascript" src="//cdn.optimizely.com/js/6758860916.js"></script><script src="/js/loader.8c36ef90.js"></script><svg id="svgtemplater" class="svgtemplater" style="display: none"><g id="svg-icon-cart"><g><path d="M6.334,84.48c62.087,0,124.174,0,186.258,0c7.046,0,7.046-10.924,0-10.924c-62.084,0-124.174,0-186.258,0 C-0.716,73.556-0.716,84.48,6.334,84.48L6.334,84.48z"/></g><g><path d="M1.063,80.473c9.465,29.911,18.931,59.809,28.392,89.715c2.119,6.692,12.668,3.833,10.537-2.901 c-9.465-29.906-18.931-59.814-28.392-89.721C9.479,70.877-1.068,73.734,1.063,80.473L1.063,80.473z"/></g><g><path d="M34.726,174.2c43.357,0,86.718,0,130.075,0c7.042,0,7.042-10.924,0-10.924c-43.357,0-86.718,0-130.075,0 C27.675,163.276,27.675,174.2,34.726,174.2L34.726,174.2z"/></g><g><path d="M170.059,170.188c9.27-29.906,18.529-59.812,27.799-89.715c2.092-6.747-8.458-9.613-10.532-2.907 c-9.27,29.906-18.529,59.818-27.794,89.721C157.443,174.033,167.984,176.897,170.059,170.188L170.059,170.188z"/></g><g><path d="M114.548,88.85C101.13,68.68,87.711,48.512,74.29,28.341c-3.879-5.834-13.344-0.371-9.429,5.514 c13.415,20.172,26.833,40.338,40.255,60.511C108.995,100.2,118.465,94.737,114.548,88.85L114.548,88.85z"/></g><g><path d="M10.192,82.882c8.33-8.331,16.658-16.659,24.988-24.985c4.984-4.984-2.736-12.71-7.727-7.724 c-8.33,8.328-16.658,16.656-24.988,24.982C-2.515,80.142,5.205,87.869,10.192,82.882L10.192,82.882z"/></g><g><path d="M31.32,59.498c46.187,0,92.368,0,138.56,0c7.042,0,7.042-10.924,0-10.924c-46.188,0-92.373,0-138.56,0 C24.277,48.574,24.277,59.498,31.32,59.498L31.32,59.498z"/></g><g><path d="M166.014,57.897c7.949,7.951,15.9,15.903,23.854,23.851c4.985,4.987,12.705-2.739,7.725-7.724 c-7.95-7.95-15.9-15.903-23.851-23.851C168.753,45.188,161.029,52.914,166.014,57.897L166.014,57.897z"/></g></g><g id="svg-icon-frozen"><title>Page 1</title><desc>Created with Sketch.</desc><defs></defs><g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd" stroke-linecap="round"><g transform="translate(1.000000, 1.000000)" stroke="#FFFFFF" stroke-width="3.5"><path d="M25.4453,0.75 L25.4453,54.904" id="Stroke-1"></path><polyline id="Stroke-3" points="33.8522 4.1041 25.4152 12.5411 17.0382 4.1651"></polyline><polyline id="Stroke-5" points="33.8522 51.5496 25.4152 43.1126 17.0382 51.4886"></polyline><path d="M1.9959,14.2884 L48.8949,41.3654" id="Stroke-7"></path><polyline id="Stroke-9" points="9.1042 8.6849 12.1922 20.2109 0.7502 23.2769"></polyline><polyline id="Stroke-11" points="50.1932 32.4076 38.6672 35.4956 41.7332 46.9386"></polyline><path d="M48.8946,14.2883 L1.9956,41.3653" id="Stroke-13"></path><polyline id="Stroke-15" points="50.1932 23.246 38.6672 20.158 41.7332 8.715"></polyline><polyline id="Stroke-17" points="9.1042 46.9688 12.1922 35.4428 0.7502 32.3768"></polyline></g></g></g><g id="svg-icon-leaf"><g id="Side-Nav" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g id="CA-252" transform="translate(-536.000000, -621.000000)"><g id="Group-2" transform="translate(506.000000, 570.000000)"><g id="Group" transform="translate(28.000000, 41.000000)"><g id="Group-5" transform="translate(0.000000, 9.000000)"><rect id="Rectangle-1-Copy-2" x="0" y="0" width="42" height="42"></rect><path d="M21.9947368,7.3162 L21.9947368,16.25 L16.1934316,10.7096 C15.1032,11.4772 14.0726526,12.2828 13.1714211,13.1416 C5.61142105,20.3597 5.72283158,26.2706 8.05847368,29.5595 L2.1,35.25 L2.1,40.95 L11.3331474,32.1321 C14.9241474,33.6179 20.5424211,32.97 27.2389895,26.5746 C35.6803263,18.5148 39.9,1.05 39.9,1.05 C39.9,1.05 30.4460211,3.1457 21.9947368,7.3162" id="Fill-17" fill="#92BC00"></path></g></g></g></g></g></g><g id="svg-icon-notify"><path d="M69.874,2.001c-37.469,0-67.843,30.375-67.843,67.843 c0,37.469,30.374,67.845,67.843,67.845c37.471,0,67.842-30.376,67.842-67.845C137.716,32.375,107.345,2.001,69.874,2.001z M69.874,129.794c-33.108,0-59.947-26.839-59.947-59.95c0-33.109,26.839-59.948,59.947-59.948c33.111,0,59.947,26.839,59.947,59.948 C129.821,102.955,102.985,129.794,69.874,129.794z"/><g><path d="M63.731,96.46c0-3.749,2.603-6.456,6.144-6.456c3.749,0,6.144,2.707,6.144,6.456 c0,3.644-2.395,6.454-6.144,6.454C66.231,102.914,63.731,100.104,63.731,96.46z M66.334,81.571l-1.458-49.978h9.997l-1.458,49.978 H66.334z"/></g></g><g id="svg-icon-otpshield"><image display="none" overflow="visible" enable-background="new " width="56" height="60" xlink:href="8C3C01A583C3DD0C.png" transform="matrix(0.48 0 0 0.48 4.9863 5.3413)"></image><g><g><path fill="#464E55" d="M24.577,3.256c-7.892,0-11.713-2.993-11.725-3.023c-0.061-0.145-0.199-0.244-0.361-0.248 c0,0-0.004,0-0.008,0c-0.156,0-0.301,0.097-0.361,0.242C12.11,0.259,8.262,3.256,0.397,3.256c-0.22,0-0.396,0.179-0.396,0.395 l2.486,13.421c0.647,5.437,9.438,9.694,9.838,9.873c0.053,0.021,0.104,0.03,0.16,0.03c0.051,0,0.105-0.01,0.156-0.03 c0.402-0.179,9.311-4.438,9.843-9.873l2.483-13.421C24.972,3.435,24.795,3.256,24.577,3.256z M12.361,24.018 c-0.312-0.137-7.114-3.416-7.704-7.725L2.673,5.785c0-0.171,0.138-0.31,0.311-0.31c6.157,0,9.211-2.346,9.218-2.371 c0.049-0.115,0.161-0.19,0.285-0.19c0.001,0,0.001,0,0.002,0l0.006,0.002c0.125,0.005,0.234,0.079,0.278,0.193 c0.011,0.022,3.043,2.366,9.222,2.366c0.17,0,0.307,0.139,0.307,0.31l-1.982,10.509c-0.635,4.186-7.393,7.59-7.705,7.725 c-0.039,0.021-0.084,0.029-0.123,0.029C12.444,24.047,12.403,24.041,12.361,24.018z"/></g><g opacity="0.25"><path fill="#464E55" d="M20.792,13.779l1.508-7.994c0-0.171-0.137-0.31-0.307-0.31c-6.179,0-9.211-2.344-9.222-2.366 c-0.044-0.114-0.153-0.188-0.278-0.193l-0.006-0.002V13.45L20.792,13.779z"/></g><g opacity="0.25"><g><polygon fill="#464E55" points="4.182,13.779 4.185,13.779 2.675,5.785 "/></g><g><path fill="#464E55" d="M4.659,16.293c0,4.303,7.39,7.59,7.706,7.725c0.038,0.021,0.083,0.029,0.122,0.029V13.45l-8.303,0.329 L4.659,16.293z"/></g></g></g></g><g id="svg-icon-question"><path d="M69.874,2.001c-37.469,0-67.843,30.375-67.843,67.843 c0,37.469,30.374,67.845,67.843,67.845c37.471,0,67.842-30.376,67.842-67.845C137.716,32.375,107.345,2.001,69.874,2.001z M69.874,129.794c-33.108,0-59.947-26.839-59.947-59.95c0-33.109,26.839-59.948,59.947-59.948c33.111,0,59.947,26.839,59.947,59.948 C129.821,102.955,102.985,129.794,69.874,129.794z"/><g><path d="M79.238,49.913c0-5.291-3.294-9.682-9.682-9.682c-4.392,0-8.386,2.195-11.479,5.789l-4.692-4.292 c4.194-4.791,9.684-8.285,16.769-8.285c10.282,0,17.071,6.09,17.071,15.972c0,13.077-16.072,18.368-14.475,32.346h-7.186 C63.368,66.186,79.238,60.496,79.238,49.913z M69.355,90.145c3.294,0,5.988,2.597,5.988,6.39c0,3.594-2.694,6.189-5.988,6.189 c-3.292,0-5.888-2.596-5.888-6.189C63.468,92.741,66.064,90.145,69.355,90.145z"/></g></g><g id="svg-icon-search"><path d="M61.616,7.624c-28.363,0-51.359,22.996-51.359,51.358c0,28.368,22.996,51.358,51.359,51.358 c28.368,0,51.359-22.99,51.359-51.358C112.975,30.62,89.983,7.624,61.616,7.624z M61.616,98.927 c-22.059,0-39.948-17.885-39.948-39.943c0-22.06,17.89-39.948,39.948-39.948s39.944,17.889,39.944,39.948 C101.56,81.042,83.675,98.927,61.616,98.927z"/><path d="M90.777,92.183l4.038-4.038c1.106-1.116,2.923-1.116,4.039,0l36.315,36.315c1.106,1.116,1.106,2.923,0,4.039 l-4.038,4.038c-1.115,1.106-2.923,1.106-4.039,0L90.777,96.221C89.661,95.105,89.661,93.29,90.777,92.183z"/></g><g id="svg-icon-shop-bag"><path d="M26.683,8.706h-5.302C20.572,4.598,18.261,0,14.239,0c-4.038,0-6.344,4.598-7.143,8.706H2.315L0,39.396 h23.885l6.71-3.344L26.683,8.706z M10.603,8.706c0.156-0.824,0.344-1.781,0.551-2.314c0.892-2.227,1.613-2.905,3.085-2.905 c1.475,0,2.193,0.679,3.076,2.906c0.215,0.535,0.406,1.496,0.555,2.313H10.603z M22.421,35.817l0.421-23.713l1.224-0.047 l2.714,21.685L22.421,35.817z"/></g></svg><div id="mainSpinner" class="spinner" style="position:absolute; left:50%; top:50%; margin-top:-40px; margin-left:-40px"></div><script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "WebSite",
"url": "https://redmart.com/",
"potentialAction": {
"@type": "SearchAction",
"target": "https://redmart.com/search/{productSearch}",
"query-input": "required name=productSearch"
}
}</script>
1 ответ
Исходный HTML-код веб-страницы по ссылке https://redmart.com/bakery не содержит необходимых данных, он использует AJAX. На веб-сайте https://redmart.com/ доступен API. Ответ возвращается в формате JSON. Перейдите на страницу, например, в Chrome, затем откройте окно "Инструменты разработчика" (F12), вкладку "Сеть", перезагрузите (F5) страницу и изучите зарегистрированные XHR. Наиболее релевантными данными являются строки JSON, возвращаемые URL:
https://api.redmart.com/v1.5.7/catalog/search?extent=2&pageSize=6&sort=1&category=bakery
Вы можете использовать приведенный ниже код VBA для получения информации, как описано выше. Импортируйте модуль JSON.bas в проект VBA для обработки JSON.
Option Explicit
Sub Scrape_redmart_com()
Dim sResponse As String
Dim oQuery As Object
Dim sQuery As String
Dim sState As String
Dim vJSON
Dim aResult()
Dim aProdSets()
Dim aProds()
Dim aRows()
Dim aHeader()
Dim j As Long
Dim i As Long
Dim u As Long
' Set query parameters
Set oQuery = CreateObject("Scripting.Dictionary")
With oQuery
.Add "extent", "2"
.Add "pageSize", "6"
.Add "sort", "1"
.Add "category", "bakery"
End With
sQuery = EncodeQueryParams(oQuery)
' Retrieve JSON data
XmlHttpRequest "GET", "https://api.redmart.com/v1.5.7/catalog/search?" & sQuery, "", "", "", sResponse
' Parse JSON response
JSON.Parse sResponse, vJSON, sState
If sState <> "Object" Then
MsgBox "Invalid JSON response"
Exit Sub
End If
' Init output
ThisWorkbook.Sheets(1).Cells.Delete
aResult = Array() ' Set ubound to -1
' Iterate over product sets
aProdSets = vJSON("productSets")
For j = 0 To UBound(aProdSets)
aProds = aProdSets(j)("products")
' Populate result array
u = UBound(aResult) + 1
ReDim Preserve aResult(u + UBound(aProds))
For i = 0 To UBound(aProds)
' Copy properties from product set into each product
Set aResult(u + i) = ExtractKeys( _
aProdSets(j), _
Array( _
"category", _
"on_sale_count", _
"total"), _
aResult(u + i) _
)
' Extract selected properties from product
Set aResult(u + i) = ExtractKeys( _
aProds(i), _
Array( _
"id", _
"title", _
"desc", _
"details", _
"product_life", _
"measure", _
"pricing", _
"sku", _
"img"), _
aResult(u + i) _
)
DoEvents
Next
Next
' Convert combined result to arrays for output
JSON.ToArray aResult, aRows, aHeader
' Output
With ThisWorkbook.Sheets(1)
OutputArray .Cells(1, 1), aHeader
Output2DArray .Cells(2, 1), aRows
.Columns.AutoFit
End With
MsgBox "Completed"
End Sub
Sub XmlHttpRequest(sMethod As String, sUrl As String, arrSetHeaders, sFormData, sRespHeaders As String, sContent As String)
Dim arrHeader
'With CreateObject("Msxml2.ServerXMLHTTP")
' .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
With CreateObject("MSXML2.XMLHTTP")
.Open sMethod, sUrl, False
If IsArray(arrSetHeaders) Then
For Each arrHeader In arrSetHeaders
.SetRequestHeader arrHeader(0), arrHeader(1)
Next
End If
.send sFormData
sRespHeaders = .GetAllResponseHeaders
sContent = .responseText
End With
End Sub
Function EncodeQueryParams(oParams As Object) As String
Dim aParams
Dim i As Long
aParams = oParams.Keys()
For i = 0 To UBound(aParams)
aParams(i) = EncodeUriComponent((aParams(i))) & "=" & EncodeUriComponent((oParams(aParams(i))))
Next
EncodeQueryParams = Join(aParams, "&")
End Function
Function EncodeUriComponent(strText As String) As String
Static objHtmlfile As Object
If objHtmlfile Is Nothing Then
Set objHtmlfile = CreateObject("htmlfile")
objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
End If
EncodeUriComponent = objHtmlfile.parentWindow.encode(strText)
End Function
Function ExtractKeys(oSource, aKeys, Optional oTarget = Nothing) As Object
Dim vKey
If TypeName(oTarget) <> "Dictionary" Then Set oTarget = CreateObject("Scripting.Dictionary")
For Each vKey In aKeys
If oSource.Exists(vKey) Then
If IsObject(oSource(vKey)) Then
Set oTarget(vKey) = oSource(vKey)
Else
oTarget(vKey) = oSource(vKey)
End If
End If
Next
Set ExtractKeys = oTarget
End Function
Sub OutputArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Вывод для меня выглядит следующим образом:
Кстати, тот же подход применяется в следующих ответах: 1, 2, 3, 4, 5, 6, 7 и 8.