Аргентина супермаркет веб-соскоб

Я пытаюсь очистить данные с сайта:

https://www.disco.com.ar/Comprar/Home.aspx

с помощью макроса в Excel 2013, например, цены в реальном времени, названия продукта и изображения.

Я пробовал веб-запрос Excel, но он не работает.

Есть ли способ сделать это?

2 ответа

Существует пример, показывающий, как данные могут быть получены с веб-сайта с использованием синтаксического анализа XHR и JSON, он состоит из нескольких этапов.

  1. Получить данные.

Я немного разбирался с XHR, используя вкладку Chrome Developer Tools Network. Наиболее важные данные, которые я нашел, - это строка JSON, возвращаемая POST XHR с https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerLimiteDeProductos

POST XHR https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerLimiteDeProductos

POST XHR не работает для меня без заголовка cookie. Таким образом, я должен добавить дополнительную HEAD XHR для извлечения ASP.NET_SessionId сначала cookie, версия сервера XMLHTTP, используемая для управления cookie. Единственными заголовками ответа, возвращающими cookie, является GET XHR с https://www.disco.com.ar/Login/PreHome.aspx

ПОЛУЧИТЕ XHR https://www.disco.com.ar/Login/PreHome.aspx

  1. Полученная строка JSON должна быть проанализирована дважды, так как она содержит второй JSON полезной нагрузки, заключенный в d Свойство первого JSON.
  2. Преобразовать разобранный объект JSON в табличную форму, представленную в 2d-массивах.
  3. Выведите массивы на лист. Вы можете выполнить дальнейшую обработку с прямым доступом к массивам.

Для веб-страницы, показанной ниже:

страница интернета

Вывод для меня выглядит следующим образом:

выход

Поместите приведенный ниже код в стандартный модуль VBA Project:

Option Explicit

Sub GetData()

    Dim sCookie As String
    Dim sPayLoad As String
    Dim sCont As String
    Dim vJSON As Variant
    Dim sState As String
    Dim y As Long
    Dim sSection As Variant
    Dim aData()
    Dim aHeader()

    ' Get cookie from the site
    With CreateObject("MSXML2.ServerXMLHTTP")
        .Open "HEAD", "https://www.disco.com.ar/Login/PreHome.aspx", False
        .Send
        sCookie = .getAllResponseHeaders
    End With
    sCookie = Split(sCookie, "Set-Cookie: ", 2)(1)
    sCookie = Split(sCookie, ";", 2)(0)
    ' Retrieve JSON data
    sPayLoad = "{IdMenu:""21063"",textoBusqueda:"""", producto:"""", marca:"""", " & _
        "pager:"""", ordenamiento:0, precioDesde:"""", precioHasta:""""}"
    With CreateObject("MSXML2.ServerXMLHTTP")
        .Open "POST", "https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerArticulosPorDescripcionMarcaFamiliaLevex", False
        .SetRequestHeader "Accept", "application/json, text/javascript, */*; q=0.01"
        .SetRequestHeader "Content-Type", "application/json; charset=utf-8"
        .SetRequestHeader "Content-Length", Len(sPayLoad)
        .SetRequestHeader "Cookie", sCookie
        .Send CStr(sPayLoad)
        sCont = .responseText
    End With
    ' Parse JSON response
    JSON.Parse sCont, vJSON, sState
    sCont = vJSON.Item("d")
    JSON.Parse sCont, vJSON, sState
    ' Output tables
    Sheets(1).Cells.Delete
    y = 1
    For Each sSection In Array("Tipo", "Marca", "Precio", "ResultadosBusquedaLevex", "ArticulosSugereridos")
        JSON.ToArray vJSON.Item(sSection), aData, aHeader
        With Sheets(1)
            .Cells(y, 1).Value = sSection
            OutputArray .Cells(y + 1, 1), aHeader
            Output2DArray .Cells(y + 2, 1), aData
            .Cells.Columns.AutoFit
        End With
        y = y + UBound(aData, 1) + 4
    Next

End Sub

Sub OutputArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                1, _
                UBound(aCells) - LBound(aCells) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

Создайте еще один стандартный модуль, назовите его JSON и поместите в него приведенный ниже код, этот код обеспечивает функциональность обработки JSON:

Option Explicit

Private sBuffer As String
Private oTokens As Object
Private oRegEx As Object
Private bMatch As Boolean
Private oChunks As Object
Private oHeader As Object
Private aData() As Variant
Private i As Long

Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)

    ' Backus–Naur form JSON parser implementation based on RegEx
    ' Input:
    ' sSample - source JSON string
    ' Output:
    ' vJson - created object or array to be returned as result
    ' sState - string Object|Array|Error depending on processing

    sBuffer = sSample
    Set oTokens = CreateObject("Scripting.Dictionary")
    Set oRegEx = CreateObject("VBScript.RegExp")
    With oRegEx ' Patterns based on specification http://www.json.org/
        .Global = True
        .MultiLine = True
        .IgnoreCase = True ' Unspecified True, False, Null accepted
        .Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
        Tokenize "s"
        .Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
        Tokenize "d"
        .Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
        Tokenize "c"
        .Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
        Tokenize "n"
        .Pattern = "\s+"
        sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
        .MultiLine = False
        Do
            bMatch = False
            .Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
            Tokenize "p"
            .Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
            Tokenize "o"
            .Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
            Tokenize "a"
        Loop While bMatch
        .Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
        If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
            Retrieve sBuffer, vJSON
            sState = IIf(IsObject(vJSON), "Object", "Array")
        Else
            vJSON = Null
            sState = "Error"
        End If
    End With
    Set oTokens = Nothing
    Set oRegEx = Nothing

End Sub

Private Sub Tokenize(sType)

    Dim aContent() As String
    Dim lCopyIndex As Long
    Dim i As Long
    Dim sKey As String

    With oRegEx.Execute(sBuffer)
        If .Count = 0 Then Exit Sub
        ReDim aContent(0 To .Count - 1)
        lCopyIndex = 1
        For i = 0 To .Count - 1
            With .Item(i)
                sKey = "<" & oTokens.Count & sType & ">"
                oTokens(sKey) = .Value
                aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
                lCopyIndex = .FirstIndex + .Length + 1
            End With
        Next
    End With
    sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
    bMatch = True

End Sub

Private Sub Retrieve(sTokenKey, vTransfer)

    Dim sTokenValue As String
    Dim sName As String
    Dim vValue As Variant
    Dim aTokens() As String
    Dim i As Long

    sTokenValue = oTokens(sTokenKey)
    With oRegEx
        .Global = True
        Select Case Left(Right(sTokenKey, 2), 1)
            Case "o"
                Set vTransfer = CreateObject("Scripting.Dictionary")
                aTokens = Split(sTokenValue, "<")
                For i = 1 To UBound(aTokens)
                    Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
                Next
            Case "p"
                aTokens = Split(sTokenValue, "<", 4)
                Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
                Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
                If IsObject(vValue) Then
                    Set vTransfer(sName) = vValue
                Else
                    vTransfer(sName) = vValue
                End If
            Case "a"
                aTokens = Split(sTokenValue, "<")
                If UBound(aTokens) = 0 Then
                    vTransfer = Array()
                Else
                    ReDim vTransfer(0 To UBound(aTokens) - 1)
                    For i = 1 To UBound(aTokens)
                        Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
                        If IsObject(vValue) Then
                            Set vTransfer(i - 1) = vValue
                        Else
                            vTransfer(i - 1) = vValue
                        End If
                    Next
                End If
            Case "n"
                vTransfer = sTokenValue
            Case "s"
                vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
                    Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
                    "\""", """"), _
                    "\\", "\"), _
                    "\/", "/"), _
                    "\b", Chr(8)), _
                    "\f", Chr(12)), _
                    "\n", vbLf), _
                    "\r", vbCr), _
                    "\t", vbTab)
                .Global = False
                .Pattern = "\\u[0-9a-fA-F]{4}"
                Do While .Test(vTransfer)
                    vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
                Loop
            Case "d"
                vTransfer = Evaluate(sTokenValue)
            Case "c"
                Select Case LCase(sTokenValue)
                    Case "true"
                        vTransfer = True
                    Case "false"
                        vTransfer = False
                    Case "null"
                        vTransfer = Null
                End Select
        End Select
    End With

End Sub

Function Serialize(vJSON As Variant) As String

    Set oChunks = CreateObject("Scripting.Dictionary")
    SerializeElement vJSON, ""
    Serialize = Join(oChunks.Items(), "")
    Set oChunks = Nothing

End Function

Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)

    Dim aKeys() As Variant
    Dim i As Long

    With oChunks
        Select Case VarType(vElement)
            Case vbObject
                If vElement.Count = 0 Then
                    .Item(.Count) = "{}"
                Else
                    .Item(.Count) = "{" & vbCrLf
                    aKeys = vElement.Keys
                    For i = 0 To UBound(aKeys)
                        .Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
                        SerializeElement vElement(aKeys(i)), sIndent & vbTab
                        If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
                        .Item(.Count) = vbCrLf
                    Next
                    .Item(.Count) = sIndent & "}"
                End If
            Case Is >= vbArray
                If UBound(vElement) = -1 Then
                    .Item(.Count) = "[]"
                Else
                    .Item(.Count) = "[" & vbCrLf
                    For i = 0 To UBound(vElement)
                        .Item(.Count) = sIndent & vbTab
                        SerializeElement vElement(i), sIndent & vbTab
                        If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
                        .Item(.Count) = vbCrLf
                    Next
                    .Item(.Count) = sIndent & "]"
                End If
            Case vbInteger, vbLong
                .Item(.Count) = vElement
            Case vbSingle, vbDouble
                .Item(.Count) = Replace(vElement, ",", ".")
            Case vbNull
                .Item(.Count) = "null"
            Case vbBoolean
                .Item(.Count) = IIf(vElement, "true", "false")
            Case Else
                .Item(.Count) = """" & _
                    Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
                        "\", "\\"), _
                        """", "\"""), _
                        "/", "\/"), _
                        Chr(8), "\b"), _
                        Chr(12), "\f"), _
                        vbLf, "\n"), _
                        vbCr, "\r"), _
                        vbTab, "\t") & _
                    """"
        End Select
    End With

End Sub

Function ToString(vJSON As Variant) As String

    Select Case VarType(vJSON)
        Case vbObject, Is >= vbArray
            Set oChunks = CreateObject("Scripting.Dictionary")
            ToStringElement vJSON, ""
            oChunks.Remove 0
            ToString = Join(oChunks.Items(), "")
            Set oChunks = Nothing
        Case vbNull
            ToString = "Null"
        Case vbBoolean
            ToString = IIf(vJSON, "True", "False")
        Case Else
            ToString = CStr(vJSON)
    End Select

End Function

Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)

    Dim aKeys() As Variant
    Dim i As Long

    With oChunks
        Select Case VarType(vElement)
            Case vbObject
                If vElement.Count = 0 Then
                    .Item(.Count) = "''"
                Else
                    .Item(.Count) = vbCrLf
                    aKeys = vElement.Keys
                    For i = 0 To UBound(aKeys)
                        .Item(.Count) = sIndent & aKeys(i) & ": "
                        ToStringElement vElement(aKeys(i)), sIndent & vbTab
                        If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
                    Next
                End If
            Case Is >= vbArray
                If UBound(vElement) = -1 Then
                    .Item(.Count) = "''"
                Else
                    .Item(.Count) = vbCrLf
                    For i = 0 To UBound(vElement)
                        .Item(.Count) = sIndent & i & ": "
                        ToStringElement vElement(i), sIndent & vbTab
                        If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
                    Next
                End If
            Case vbNull
                .Item(.Count) = "Null"
            Case vbBoolean
                .Item(.Count) = IIf(vElement, "True", "False")
            Case Else
                .Item(.Count) = CStr(vElement)
        End Select
    End With

End Sub

Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)

    ' Input:
    ' vJSON - Array or Object which contains rows data
    ' Output:
    ' aData - 2d array representing JSON data
    ' aHeader - 1d array of property names

    Dim sName As Variant

    Set oHeader = CreateObject("Scripting.Dictionary")
    Select Case VarType(vJSON)
        Case vbObject
            If vJSON.Count > 0 Then
                ReDim aData(0 To vJSON.Count - 1, 0 To 0)
                oHeader("#") = 0
                i = 0
                For Each sName In vJSON
                    aData(i, 0) = "#" & sName
                    ToArrayElement vJSON(sName), ""
                    i = i + 1
                Next
            Else
                ReDim aData(0 To 0, 0 To 0)
            End If
        Case Is >= vbArray
            If UBound(vJSON) >= 0 Then
                ReDim aData(0 To UBound(vJSON), 0 To 0)
                For i = 0 To UBound(vJSON)
                    ToArrayElement vJSON(i), ""
                Next
            Else
                ReDim aData(0 To 0, 0 To 0)
            End If
        Case Else
            ReDim aData(0 To 0, 0 To 0)
            aData(0, 0) = ToString(vJSON)
    End Select
    aHeader = oHeader.Keys()
    Set oHeader = Nothing
    aRows = aData
    Erase aData

End Sub

Private Sub ToArrayElement(vElement As Variant, sFieldName As String)

    Dim sName As Variant
    Dim j As Long

    Select Case VarType(vElement)
        Case vbObject ' collection of objects
            For Each sName In vElement
                ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
            Next
        Case Is >= vbArray  ' collection of arrays
            For j = 0 To UBound(vElement)
                ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
            Next
        Case Else
            If Not oHeader.Exists(sFieldName) Then
                oHeader(sFieldName) = oHeader.Count
                If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
            End If
            j = oHeader(sFieldName)
            aData(i, j) = ToString(vElement)
    End Select

End Sub

Проверьте VBA-JSON-анализатор на GitHub для получения последней версии анализатора JSON (импортируйте модуль JSON.bas в проект VBA для обработки JSON).

Я бы посоветовал вам использовать Selenium с Python. Настройка займет некоторое время, но как только это будет сделано, у вас будет идеальный инструмент для работы. Это позволит вам очищать любой веб-сайт, который вам нужен, используя простой синтаксис Python, используя нужный веб-браузер (Firefox или Chrome), читая и взаимодействуя с javascript. Я использую его ежедневно.

Другие вопросы по тегам