Невозможно получить название продукта со страницы, зашифрованной в javascript

Создавая скребок с целью анализа названия различных продуктов с веб-страницы, когда я запускаю ее, я ничего не получаю. Что я мог видеть, что желаемое содержимое находится внутри элемента javascript. Поскольку я не хочу использовать какой-либо конвертер для сбора этих названий, я ожидаю, что кто-нибудь поможет мне получить это, используя словарь. То, что я попробовал изначально, было так:

Sub RmartData()
Dim http As New XMLHTTP60, html As New HTMLDocument
Dim titelem As Object, title As Object, protitle As Object

With http
    .Open "GET", "https://redmart.com/bakery", False
    .send
    html.body.innerHTML = .responseText
End With

Set titelem = html.getElementsByClassName("description")
For Each title In titelem
Set protitle = title.getElementsByTagName("a")
     x = x + 1
    Cells(x, 1) = protitle(0).innerText
 Next title
End Sub

Осматривая элемент я вижу это:

<div class="description"><!-- react-empty: 288 --><h4 title="Gardenia Hotdog Buns 4's"><a href="/product/gardenia-hotdog-buns-4's-13400">Gardenia Hotdog Buns 4's</a></h4><div><span class="size">220 g</span><a href="#" data-tooltip="Guaranteed fresh for 3+ days, inc. delivery day" class="tag green small tooltip">3D+</a></div><div id="BVRRInlineRating-13400" class="ratings"> <div class="bv-cleanslate bv-cv2-cleanslate"> <div data-bv-v="ratingItem:37" class="bv-shared bv-core-container-134 bv-rating-top-statistic">
 <!--[if lt IE 7]> <div class="bv-compat bvie6 bvie-lt8 bvie"> <![endif]--> <!--[if IE 7]> <div class="bv-compat bvie7 bvie-lt8 bvie"> <![endif]--> <!--[if IE 8]> <div class="bv-compat bvie8 bvie"> <![endif]--> <!--[if IE 9]> <div class="bv-compat bvie9 bvie"> <![endif]--> <!--[if gt IE 9]> <!--><div class="bv-compat"> <!--<![endif]--> <div class="bv-inline-rating-container">   <dl class="bv-stars-container" role="presentation">    <dd class="bv-rating-ratio" role="presentation">  <span class="bv-rating-stars-container">   <span class="bv-rating-stars bv-rating-stars-off" aria-hidden="true">  ★★★★★  </span> <span class="bv-rating-stars-on bv-rating-stars" style="width:40% !important;" aria-hidden="true">  ★★★★★  </span>     <span class="bv-off-screen">2 out of 5 stars. Read reviews.</span>     </span>  </dd>      <dd class="bv-rating-ratio-count" role="presentation"> <span class="bv-rating-label">   (1)  </span>  </dd>   </dl>   </div> </div> 
</div> </div></div></div>

В инструментах разработчика Chrome, нажав кнопку ответа, я получаю это:

<!DOCTYPE html><html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" itemscope itemtype="http://schema.org/WebPage"><head><meta charset="utf-8"><title>Online Grocery Shopping - Groceries Delivery Singapore | RedMart</title><meta name="description" content="Singapore's leading online grocery service. Fresh groceries, household essentials and specialty products delivered to your door."><meta name="keywords" content="online groceries, grocery delivery singapore, online grocery shopping, online supermarket"><meta name="p:domain_verify" content="bf4d2b7c4d98534d44b1f2ff3bb02ee7"><meta name="msvalidate.01" content="A8CF80424AEF2B51E2C1313E6EA133BD"/><meta name="google-site-verification" content="AXdSfFmvBak9HqZjP7ihAAN5v2iVUu7BEsbCIcvJgwg"/><meta property="og:type" content="website"><meta property="og:title" content="Online Grocery Shopping - Groceries Delivery Singapore"><meta property="og:description" content="Singapore's leading online grocery service. Fresh groceries, household essentials and specialty products delivered to your door."><meta property="og:url" content="https://redmart.com/"><meta property="og:image" content="https://s3-ap-southeast-1.amazonaws.com/media.redmart.com/assets/logo-FB.png"><meta property="og:site_name" content="RedMart"><meta property="fb:app_id" content="219268451429695"><meta name="git:tag" content="" 2="" 13="" 0=""><meta name="git:rev" content="68f1442"><meta name="git:commit_id" content="68f14427b86c2d64a3733140b5c205a013eb00e2"><meta name="git:date" content="2017-05-11 09:57:13 +0800"><meta name="viewport" content="width=1024"><link rel="icon" type="image/png" href="/img/favicon.ca3eb8fb.png"><meta name="apple-itunes-app" content="app-id=606780396, app-argument={{url}}"><meta name="google-play-app" content="app-id=com.redmart.redmart"><link rel="apple-touch-icon" href="http://media.redmart.com/assets/appicon-152x152.png"><link rel="alternate" href="android-app://com.redmart.redmart"/><meta name="apple-mobile-web-app-capable" content="yes"><meta name="mobile-web-app-capable" content="yes"><link rel="stylesheet" href="/styles/app.afc1f1db.css"><link rel="canonical" href="https://redmart.com/"/><style>.spinner{width:80px;height:80px;margin:0;padding: 0;border:1px solid #ee4054;-webkit-border-radius:50%;border-radius:50%;border-top-color:transparent;border-left-color:transparent;-webkit-animation:spin 600ms infinite linear;-moz-animation:spin 600ms infinite linear;-o-animation:spin 600ms infinite linear;-ms-animation:spin 600ms infinite linear;animation:spin 600ms infinite linear}@-moz-keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}@-webkit-keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}@-o-keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}@keyframes spin{100%{-webkit-transform:rotate(360deg);-moz-transform:rotate(360deg);-o-transform:rotate(360deg);-ms-transform:rotate(360deg);transform:rotate(360deg)}}</style><body style="background: #ECF0F1"><div id="app" class="firstLoad"><header id="navbar"></header><div id="middle"><aside id="leftSidebar"></aside><section id="layoutContainerWrapper"><section id="stickyNotification"></section><section id="contentBanner"></section><section id="contentTabs"></section><section id="contentWrapper"><section id="contentNavigation"></section><article id="contentSection"></article></section></section><aside id="rightSidebar"></aside></div><footer id="footer"></footer></div><div id="notifications"></div><div id="lightbox" class="hide"></div><div id="modal"></div><div id="tooltips"></div><div id="overlay"></div><div id="offline" class="fullScreen" style="display:none"><div><svg id="logo" viewBox="0 0 1024 237.391"><use xlink:href="#svg-rmlogo"></use></svg><div class="spinner"></div></div></div><script type="text/javascript" src="//display.ugc.bazaarvoice.com/static/redmart/en_SG/bvapi.js"></script><script type="text/javascript" src="//cdn.optimizely.com/js/6758860916.js"></script><script src="/js/loader.8c36ef90.js"></script><svg id="svgtemplater" class="svgtemplater" style="display: none"><g id="svg-icon-cart"><g><path d="M6.334,84.48c62.087,0,124.174,0,186.258,0c7.046,0,7.046-10.924,0-10.924c-62.084,0-124.174,0-186.258,0 C-0.716,73.556-0.716,84.48,6.334,84.48L6.334,84.48z"/></g><g><path d="M1.063,80.473c9.465,29.911,18.931,59.809,28.392,89.715c2.119,6.692,12.668,3.833,10.537-2.901 c-9.465-29.906-18.931-59.814-28.392-89.721C9.479,70.877-1.068,73.734,1.063,80.473L1.063,80.473z"/></g><g><path d="M34.726,174.2c43.357,0,86.718,0,130.075,0c7.042,0,7.042-10.924,0-10.924c-43.357,0-86.718,0-130.075,0 C27.675,163.276,27.675,174.2,34.726,174.2L34.726,174.2z"/></g><g><path d="M170.059,170.188c9.27-29.906,18.529-59.812,27.799-89.715c2.092-6.747-8.458-9.613-10.532-2.907 c-9.27,29.906-18.529,59.818-27.794,89.721C157.443,174.033,167.984,176.897,170.059,170.188L170.059,170.188z"/></g><g><path d="M114.548,88.85C101.13,68.68,87.711,48.512,74.29,28.341c-3.879-5.834-13.344-0.371-9.429,5.514 c13.415,20.172,26.833,40.338,40.255,60.511C108.995,100.2,118.465,94.737,114.548,88.85L114.548,88.85z"/></g><g><path d="M10.192,82.882c8.33-8.331,16.658-16.659,24.988-24.985c4.984-4.984-2.736-12.71-7.727-7.724 c-8.33,8.328-16.658,16.656-24.988,24.982C-2.515,80.142,5.205,87.869,10.192,82.882L10.192,82.882z"/></g><g><path d="M31.32,59.498c46.187,0,92.368,0,138.56,0c7.042,0,7.042-10.924,0-10.924c-46.188,0-92.373,0-138.56,0 C24.277,48.574,24.277,59.498,31.32,59.498L31.32,59.498z"/></g><g><path d="M166.014,57.897c7.949,7.951,15.9,15.903,23.854,23.851c4.985,4.987,12.705-2.739,7.725-7.724 c-7.95-7.95-15.9-15.903-23.851-23.851C168.753,45.188,161.029,52.914,166.014,57.897L166.014,57.897z"/></g></g><g id="svg-icon-frozen"><title>Page 1</title><desc>Created with Sketch.</desc><defs></defs><g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd" stroke-linecap="round"><g transform="translate(1.000000, 1.000000)" stroke="#FFFFFF" stroke-width="3.5"><path d="M25.4453,0.75 L25.4453,54.904" id="Stroke-1"></path><polyline id="Stroke-3" points="33.8522 4.1041 25.4152 12.5411 17.0382 4.1651"></polyline><polyline id="Stroke-5" points="33.8522 51.5496 25.4152 43.1126 17.0382 51.4886"></polyline><path d="M1.9959,14.2884 L48.8949,41.3654" id="Stroke-7"></path><polyline id="Stroke-9" points="9.1042 8.6849 12.1922 20.2109 0.7502 23.2769"></polyline><polyline id="Stroke-11" points="50.1932 32.4076 38.6672 35.4956 41.7332 46.9386"></polyline><path d="M48.8946,14.2883 L1.9956,41.3653" id="Stroke-13"></path><polyline id="Stroke-15" points="50.1932 23.246 38.6672 20.158 41.7332 8.715"></polyline><polyline id="Stroke-17" points="9.1042 46.9688 12.1922 35.4428 0.7502 32.3768"></polyline></g></g></g><g id="svg-icon-leaf"><g id="Side-Nav" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g id="CA-252" transform="translate(-536.000000, -621.000000)"><g id="Group-2" transform="translate(506.000000, 570.000000)"><g id="Group" transform="translate(28.000000, 41.000000)"><g id="Group-5" transform="translate(0.000000, 9.000000)"><rect id="Rectangle-1-Copy-2" x="0" y="0" width="42" height="42"></rect><path d="M21.9947368,7.3162 L21.9947368,16.25 L16.1934316,10.7096 C15.1032,11.4772 14.0726526,12.2828 13.1714211,13.1416 C5.61142105,20.3597 5.72283158,26.2706 8.05847368,29.5595 L2.1,35.25 L2.1,40.95 L11.3331474,32.1321 C14.9241474,33.6179 20.5424211,32.97 27.2389895,26.5746 C35.6803263,18.5148 39.9,1.05 39.9,1.05 C39.9,1.05 30.4460211,3.1457 21.9947368,7.3162" id="Fill-17" fill="#92BC00"></path></g></g></g></g></g></g><g id="svg-icon-notify"><path d="M69.874,2.001c-37.469,0-67.843,30.375-67.843,67.843 c0,37.469,30.374,67.845,67.843,67.845c37.471,0,67.842-30.376,67.842-67.845C137.716,32.375,107.345,2.001,69.874,2.001z M69.874,129.794c-33.108,0-59.947-26.839-59.947-59.95c0-33.109,26.839-59.948,59.947-59.948c33.111,0,59.947,26.839,59.947,59.948 C129.821,102.955,102.985,129.794,69.874,129.794z"/><g><path d="M63.731,96.46c0-3.749,2.603-6.456,6.144-6.456c3.749,0,6.144,2.707,6.144,6.456 c0,3.644-2.395,6.454-6.144,6.454C66.231,102.914,63.731,100.104,63.731,96.46z M66.334,81.571l-1.458-49.978h9.997l-1.458,49.978 H66.334z"/></g></g><g id="svg-icon-otpshield"><image display="none" overflow="visible" enable-background="new " width="56" height="60" xlink:href="8C3C01A583C3DD0C.png" transform="matrix(0.48 0 0 0.48 4.9863 5.3413)"></image><g><g><path fill="#464E55" d="M24.577,3.256c-7.892,0-11.713-2.993-11.725-3.023c-0.061-0.145-0.199-0.244-0.361-0.248 c0,0-0.004,0-0.008,0c-0.156,0-0.301,0.097-0.361,0.242C12.11,0.259,8.262,3.256,0.397,3.256c-0.22,0-0.396,0.179-0.396,0.395 l2.486,13.421c0.647,5.437,9.438,9.694,9.838,9.873c0.053,0.021,0.104,0.03,0.16,0.03c0.051,0,0.105-0.01,0.156-0.03 c0.402-0.179,9.311-4.438,9.843-9.873l2.483-13.421C24.972,3.435,24.795,3.256,24.577,3.256z M12.361,24.018 c-0.312-0.137-7.114-3.416-7.704-7.725L2.673,5.785c0-0.171,0.138-0.31,0.311-0.31c6.157,0,9.211-2.346,9.218-2.371 c0.049-0.115,0.161-0.19,0.285-0.19c0.001,0,0.001,0,0.002,0l0.006,0.002c0.125,0.005,0.234,0.079,0.278,0.193 c0.011,0.022,3.043,2.366,9.222,2.366c0.17,0,0.307,0.139,0.307,0.31l-1.982,10.509c-0.635,4.186-7.393,7.59-7.705,7.725 c-0.039,0.021-0.084,0.029-0.123,0.029C12.444,24.047,12.403,24.041,12.361,24.018z"/></g><g opacity="0.25"><path fill="#464E55" d="M20.792,13.779l1.508-7.994c0-0.171-0.137-0.31-0.307-0.31c-6.179,0-9.211-2.344-9.222-2.366 c-0.044-0.114-0.153-0.188-0.278-0.193l-0.006-0.002V13.45L20.792,13.779z"/></g><g opacity="0.25"><g><polygon fill="#464E55" points="4.182,13.779 4.185,13.779 2.675,5.785 "/></g><g><path fill="#464E55" d="M4.659,16.293c0,4.303,7.39,7.59,7.706,7.725c0.038,0.021,0.083,0.029,0.122,0.029V13.45l-8.303,0.329 L4.659,16.293z"/></g></g></g></g><g id="svg-icon-question"><path d="M69.874,2.001c-37.469,0-67.843,30.375-67.843,67.843 c0,37.469,30.374,67.845,67.843,67.845c37.471,0,67.842-30.376,67.842-67.845C137.716,32.375,107.345,2.001,69.874,2.001z M69.874,129.794c-33.108,0-59.947-26.839-59.947-59.95c0-33.109,26.839-59.948,59.947-59.948c33.111,0,59.947,26.839,59.947,59.948 C129.821,102.955,102.985,129.794,69.874,129.794z"/><g><path d="M79.238,49.913c0-5.291-3.294-9.682-9.682-9.682c-4.392,0-8.386,2.195-11.479,5.789l-4.692-4.292 c4.194-4.791,9.684-8.285,16.769-8.285c10.282,0,17.071,6.09,17.071,15.972c0,13.077-16.072,18.368-14.475,32.346h-7.186 C63.368,66.186,79.238,60.496,79.238,49.913z M69.355,90.145c3.294,0,5.988,2.597,5.988,6.39c0,3.594-2.694,6.189-5.988,6.189 c-3.292,0-5.888-2.596-5.888-6.189C63.468,92.741,66.064,90.145,69.355,90.145z"/></g></g><g id="svg-icon-search"><path d="M61.616,7.624c-28.363,0-51.359,22.996-51.359,51.358c0,28.368,22.996,51.358,51.359,51.358 c28.368,0,51.359-22.99,51.359-51.358C112.975,30.62,89.983,7.624,61.616,7.624z M61.616,98.927 c-22.059,0-39.948-17.885-39.948-39.943c0-22.06,17.89-39.948,39.948-39.948s39.944,17.889,39.944,39.948 C101.56,81.042,83.675,98.927,61.616,98.927z"/><path d="M90.777,92.183l4.038-4.038c1.106-1.116,2.923-1.116,4.039,0l36.315,36.315c1.106,1.116,1.106,2.923,0,4.039 l-4.038,4.038c-1.115,1.106-2.923,1.106-4.039,0L90.777,96.221C89.661,95.105,89.661,93.29,90.777,92.183z"/></g><g id="svg-icon-shop-bag"><path d="M26.683,8.706h-5.302C20.572,4.598,18.261,0,14.239,0c-4.038,0-6.344,4.598-7.143,8.706H2.315L0,39.396 h23.885l6.71-3.344L26.683,8.706z M10.603,8.706c0.156-0.824,0.344-1.781,0.551-2.314c0.892-2.227,1.613-2.905,3.085-2.905 c1.475,0,2.193,0.679,3.076,2.906c0.215,0.535,0.406,1.496,0.555,2.313H10.603z M22.421,35.817l0.421-23.713l1.224-0.047 l2.714,21.685L22.421,35.817z"/></g></svg><div id="mainSpinner" class="spinner" style="position:absolute; left:50%; top:50%; margin-top:-40px; margin-left:-40px"></div><script type="application/ld+json">{
  "@context": "http://schema.org",
  "@type": "WebSite",
  "url": "https://redmart.com/",
  "potentialAction": {
    "@type": "SearchAction",
    "target": "https://redmart.com/search/{productSearch}",
    "query-input": "required name=productSearch"
  }
}</script>

1 ответ

Решение

Исходный HTML-код веб-страницы по ссылке https://redmart.com/bakery не содержит необходимых данных, он использует AJAX. На веб-сайте https://redmart.com/ доступен API. Ответ возвращается в формате JSON. Перейдите на страницу, например, в Chrome, затем откройте окно "Инструменты разработчика" (F12), вкладку "Сеть", перезагрузите (F5) страницу и изучите зарегистрированные XHR. Наиболее релевантными данными являются строки JSON, возвращаемые URL:

https://api.redmart.com/v1.5.7/catalog/search?extent=2&pageSize=6&sort=1&category=bakery

XHR-просмотр

XHR-заголовки

Вы можете использовать приведенный ниже код VBA для получения информации, как описано выше. Импортируйте модуль JSON.bas в проект VBA для обработки JSON.

Option Explicit

Sub Scrape_redmart_com()

    Dim sResponse As String
    Dim oQuery As Object
    Dim sQuery As String
    Dim sState As String
    Dim vJSON
    Dim aResult()
    Dim aProdSets()
    Dim aProds()
    Dim aRows()
    Dim aHeader()
    Dim j As Long
    Dim i As Long
    Dim u As Long

    ' Set query parameters
    Set oQuery = CreateObject("Scripting.Dictionary")
    With oQuery
        .Add "extent", "2"
        .Add "pageSize", "6"
        .Add "sort", "1"
        .Add "category", "bakery"
    End With
    sQuery = EncodeQueryParams(oQuery)
    ' Retrieve JSON data
    XmlHttpRequest "GET", "https://api.redmart.com/v1.5.7/catalog/search?" & sQuery, "", "", "", sResponse
    ' Parse JSON response
    JSON.Parse sResponse, vJSON, sState
    If sState <> "Object" Then
        MsgBox "Invalid JSON response"
        Exit Sub
    End If
    ' Init output
    ThisWorkbook.Sheets(1).Cells.Delete
    aResult = Array() ' Set ubound to -1
    ' Iterate over product sets
    aProdSets = vJSON("productSets")
    For j = 0 To UBound(aProdSets)
        aProds = aProdSets(j)("products")
        ' Populate result array
        u = UBound(aResult) + 1
        ReDim Preserve aResult(u + UBound(aProds))
        For i = 0 To UBound(aProds)
            ' Copy properties from product set into each product
            Set aResult(u + i) = ExtractKeys( _
                aProdSets(j), _
                Array( _
                    "category", _
                    "on_sale_count", _
                    "total"), _
                aResult(u + i) _
            )
            ' Extract selected properties from product
            Set aResult(u + i) = ExtractKeys( _
                aProds(i), _
                Array( _
                    "id", _
                    "title", _
                    "desc", _
                    "details", _
                    "product_life", _
                    "measure", _
                    "pricing", _
                    "sku", _
                    "img"), _
                aResult(u + i) _
            )

            DoEvents
        Next
    Next
    ' Convert combined result to arrays for output
    JSON.ToArray aResult, aRows, aHeader
    ' Output
    With ThisWorkbook.Sheets(1)
        OutputArray .Cells(1, 1), aHeader
        Output2DArray .Cells(2, 1), aRows
        .Columns.AutoFit
    End With

    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sMethod As String, sUrl As String, arrSetHeaders, sFormData, sRespHeaders As String, sContent As String)

    Dim arrHeader

    'With CreateObject("Msxml2.ServerXMLHTTP")
    '    .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False
        If IsArray(arrSetHeaders) Then
            For Each arrHeader In arrSetHeaders
                .SetRequestHeader arrHeader(0), arrHeader(1)
            Next
        End If
        .send sFormData
        sRespHeaders = .GetAllResponseHeaders
        sContent = .responseText
    End With

End Sub

Function EncodeQueryParams(oParams As Object) As String

    Dim aParams
    Dim i As Long

    aParams = oParams.Keys()
    For i = 0 To UBound(aParams)
        aParams(i) = EncodeUriComponent((aParams(i))) & "=" & EncodeUriComponent((oParams(aParams(i))))
    Next
    EncodeQueryParams = Join(aParams, "&")

End Function

Function EncodeUriComponent(strText As String) As String

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
    End If
    EncodeUriComponent = objHtmlfile.parentWindow.encode(strText)

End Function

Function ExtractKeys(oSource, aKeys, Optional oTarget = Nothing) As Object

    Dim vKey

    If TypeName(oTarget) <> "Dictionary" Then Set oTarget = CreateObject("Scripting.Dictionary")
    For Each vKey In aKeys
        If oSource.Exists(vKey) Then
            If IsObject(oSource(vKey)) Then
                Set oTarget(vKey) = oSource(vKey)
            Else
                oTarget(vKey) = oSource(vKey)
            End If
        End If
    Next
    Set ExtractKeys = oTarget

End Function

Sub OutputArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

Вывод для меня выглядит следующим образом:

выход

Кстати, тот же подход применяется в следующих ответах: 1, 2, 3, 4, 5, 6, 7 и 8.

Другие вопросы по тегам