Аргентина супермаркет веб-соскоб
Я пытаюсь очистить данные с сайта:
https://www.disco.com.ar/Comprar/Home.aspx
с помощью макроса в Excel 2013, например, цены в реальном времени, названия продукта и изображения.
Я пробовал веб-запрос Excel, но он не работает.
Есть ли способ сделать это?
2 ответа
Существует пример, показывающий, как данные могут быть получены с веб-сайта с использованием синтаксического анализа XHR и JSON, он состоит из нескольких этапов.
- Получить данные.
Я немного разбирался с XHR, используя вкладку Chrome Developer Tools Network. Наиболее важные данные, которые я нашел, - это строка JSON, возвращаемая POST XHR с https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerLimiteDeProductos
POST XHR не работает для меня без заголовка cookie. Таким образом, я должен добавить дополнительную HEAD XHR для извлечения ASP.NET_SessionId
сначала cookie, версия сервера XMLHTTP, используемая для управления cookie. Единственными заголовками ответа, возвращающими cookie, является GET XHR с https://www.disco.com.ar/Login/PreHome.aspx
- Полученная строка JSON должна быть проанализирована дважды, так как она содержит второй JSON полезной нагрузки, заключенный в
d
Свойство первого JSON. - Преобразовать разобранный объект JSON в табличную форму, представленную в 2d-массивах.
- Выведите массивы на лист. Вы можете выполнить дальнейшую обработку с прямым доступом к массивам.
Для веб-страницы, показанной ниже:
Вывод для меня выглядит следующим образом:
Поместите приведенный ниже код в стандартный модуль VBA Project:
Option Explicit
Sub GetData()
Dim sCookie As String
Dim sPayLoad As String
Dim sCont As String
Dim vJSON As Variant
Dim sState As String
Dim y As Long
Dim sSection As Variant
Dim aData()
Dim aHeader()
' Get cookie from the site
With CreateObject("MSXML2.ServerXMLHTTP")
.Open "HEAD", "https://www.disco.com.ar/Login/PreHome.aspx", False
.Send
sCookie = .getAllResponseHeaders
End With
sCookie = Split(sCookie, "Set-Cookie: ", 2)(1)
sCookie = Split(sCookie, ";", 2)(0)
' Retrieve JSON data
sPayLoad = "{IdMenu:""21063"",textoBusqueda:"""", producto:"""", marca:"""", " & _
"pager:"""", ordenamiento:0, precioDesde:"""", precioHasta:""""}"
With CreateObject("MSXML2.ServerXMLHTTP")
.Open "POST", "https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerArticulosPorDescripcionMarcaFamiliaLevex", False
.SetRequestHeader "Accept", "application/json, text/javascript, */*; q=0.01"
.SetRequestHeader "Content-Type", "application/json; charset=utf-8"
.SetRequestHeader "Content-Length", Len(sPayLoad)
.SetRequestHeader "Cookie", sCookie
.Send CStr(sPayLoad)
sCont = .responseText
End With
' Parse JSON response
JSON.Parse sCont, vJSON, sState
sCont = vJSON.Item("d")
JSON.Parse sCont, vJSON, sState
' Output tables
Sheets(1).Cells.Delete
y = 1
For Each sSection In Array("Tipo", "Marca", "Precio", "ResultadosBusquedaLevex", "ArticulosSugereridos")
JSON.ToArray vJSON.Item(sSection), aData, aHeader
With Sheets(1)
.Cells(y, 1).Value = sSection
OutputArray .Cells(y + 1, 1), aHeader
Output2DArray .Cells(y + 2, 1), aData
.Cells.Columns.AutoFit
End With
y = y + UBound(aData, 1) + 4
Next
End Sub
Sub OutputArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
1, _
UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Создайте еще один стандартный модуль, назовите его JSON
и поместите в него приведенный ниже код, этот код обеспечивает функциональность обработки JSON:
Option Explicit
Private sBuffer As String
Private oTokens As Object
Private oRegEx As Object
Private bMatch As Boolean
Private oChunks As Object
Private oHeader As Object
Private aData() As Variant
Private i As Long
Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
' Backus–Naur form JSON parser implementation based on RegEx
' Input:
' sSample - source JSON string
' Output:
' vJson - created object or array to be returned as result
' sState - string Object|Array|Error depending on processing
sBuffer = sSample
Set oTokens = CreateObject("Scripting.Dictionary")
Set oRegEx = CreateObject("VBScript.RegExp")
With oRegEx ' Patterns based on specification http://www.json.org/
.Global = True
.MultiLine = True
.IgnoreCase = True ' Unspecified True, False, Null accepted
.Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
Tokenize "s"
.Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
Tokenize "d"
.Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
Tokenize "c"
.Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
Tokenize "n"
.Pattern = "\s+"
sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
.MultiLine = False
Do
bMatch = False
.Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
Tokenize "p"
.Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
Tokenize "o"
.Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
Tokenize "a"
Loop While bMatch
.Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
Retrieve sBuffer, vJSON
sState = IIf(IsObject(vJSON), "Object", "Array")
Else
vJSON = Null
sState = "Error"
End If
End With
Set oTokens = Nothing
Set oRegEx = Nothing
End Sub
Private Sub Tokenize(sType)
Dim aContent() As String
Dim lCopyIndex As Long
Dim i As Long
Dim sKey As String
With oRegEx.Execute(sBuffer)
If .Count = 0 Then Exit Sub
ReDim aContent(0 To .Count - 1)
lCopyIndex = 1
For i = 0 To .Count - 1
With .Item(i)
sKey = "<" & oTokens.Count & sType & ">"
oTokens(sKey) = .Value
aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
lCopyIndex = .FirstIndex + .Length + 1
End With
Next
End With
sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
bMatch = True
End Sub
Private Sub Retrieve(sTokenKey, vTransfer)
Dim sTokenValue As String
Dim sName As String
Dim vValue As Variant
Dim aTokens() As String
Dim i As Long
sTokenValue = oTokens(sTokenKey)
With oRegEx
.Global = True
Select Case Left(Right(sTokenKey, 2), 1)
Case "o"
Set vTransfer = CreateObject("Scripting.Dictionary")
aTokens = Split(sTokenValue, "<")
For i = 1 To UBound(aTokens)
Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
Next
Case "p"
aTokens = Split(sTokenValue, "<", 4)
Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
If IsObject(vValue) Then
Set vTransfer(sName) = vValue
Else
vTransfer(sName) = vValue
End If
Case "a"
aTokens = Split(sTokenValue, "<")
If UBound(aTokens) = 0 Then
vTransfer = Array()
Else
ReDim vTransfer(0 To UBound(aTokens) - 1)
For i = 1 To UBound(aTokens)
Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
If IsObject(vValue) Then
Set vTransfer(i - 1) = vValue
Else
vTransfer(i - 1) = vValue
End If
Next
End If
Case "n"
vTransfer = sTokenValue
Case "s"
vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
"\""", """"), _
"\\", "\"), _
"\/", "/"), _
"\b", Chr(8)), _
"\f", Chr(12)), _
"\n", vbLf), _
"\r", vbCr), _
"\t", vbTab)
.Global = False
.Pattern = "\\u[0-9a-fA-F]{4}"
Do While .Test(vTransfer)
vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
Loop
Case "d"
vTransfer = Evaluate(sTokenValue)
Case "c"
Select Case LCase(sTokenValue)
Case "true"
vTransfer = True
Case "false"
vTransfer = False
Case "null"
vTransfer = Null
End Select
End Select
End With
End Sub
Function Serialize(vJSON As Variant) As String
Set oChunks = CreateObject("Scripting.Dictionary")
SerializeElement vJSON, ""
Serialize = Join(oChunks.Items(), "")
Set oChunks = Nothing
End Function
Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)
Dim aKeys() As Variant
Dim i As Long
With oChunks
Select Case VarType(vElement)
Case vbObject
If vElement.Count = 0 Then
.Item(.Count) = "{}"
Else
.Item(.Count) = "{" & vbCrLf
aKeys = vElement.Keys
For i = 0 To UBound(aKeys)
.Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
SerializeElement vElement(aKeys(i)), sIndent & vbTab
If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
.Item(.Count) = vbCrLf
Next
.Item(.Count) = sIndent & "}"
End If
Case Is >= vbArray
If UBound(vElement) = -1 Then
.Item(.Count) = "[]"
Else
.Item(.Count) = "[" & vbCrLf
For i = 0 To UBound(vElement)
.Item(.Count) = sIndent & vbTab
SerializeElement vElement(i), sIndent & vbTab
If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
.Item(.Count) = vbCrLf
Next
.Item(.Count) = sIndent & "]"
End If
Case vbInteger, vbLong
.Item(.Count) = vElement
Case vbSingle, vbDouble
.Item(.Count) = Replace(vElement, ",", ".")
Case vbNull
.Item(.Count) = "null"
Case vbBoolean
.Item(.Count) = IIf(vElement, "true", "false")
Case Else
.Item(.Count) = """" & _
Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
"\", "\\"), _
"""", "\"""), _
"/", "\/"), _
Chr(8), "\b"), _
Chr(12), "\f"), _
vbLf, "\n"), _
vbCr, "\r"), _
vbTab, "\t") & _
""""
End Select
End With
End Sub
Function ToString(vJSON As Variant) As String
Select Case VarType(vJSON)
Case vbObject, Is >= vbArray
Set oChunks = CreateObject("Scripting.Dictionary")
ToStringElement vJSON, ""
oChunks.Remove 0
ToString = Join(oChunks.Items(), "")
Set oChunks = Nothing
Case vbNull
ToString = "Null"
Case vbBoolean
ToString = IIf(vJSON, "True", "False")
Case Else
ToString = CStr(vJSON)
End Select
End Function
Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)
Dim aKeys() As Variant
Dim i As Long
With oChunks
Select Case VarType(vElement)
Case vbObject
If vElement.Count = 0 Then
.Item(.Count) = "''"
Else
.Item(.Count) = vbCrLf
aKeys = vElement.Keys
For i = 0 To UBound(aKeys)
.Item(.Count) = sIndent & aKeys(i) & ": "
ToStringElement vElement(aKeys(i)), sIndent & vbTab
If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
Next
End If
Case Is >= vbArray
If UBound(vElement) = -1 Then
.Item(.Count) = "''"
Else
.Item(.Count) = vbCrLf
For i = 0 To UBound(vElement)
.Item(.Count) = sIndent & i & ": "
ToStringElement vElement(i), sIndent & vbTab
If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
Next
End If
Case vbNull
.Item(.Count) = "Null"
Case vbBoolean
.Item(.Count) = IIf(vElement, "True", "False")
Case Else
.Item(.Count) = CStr(vElement)
End Select
End With
End Sub
Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)
' Input:
' vJSON - Array or Object which contains rows data
' Output:
' aData - 2d array representing JSON data
' aHeader - 1d array of property names
Dim sName As Variant
Set oHeader = CreateObject("Scripting.Dictionary")
Select Case VarType(vJSON)
Case vbObject
If vJSON.Count > 0 Then
ReDim aData(0 To vJSON.Count - 1, 0 To 0)
oHeader("#") = 0
i = 0
For Each sName In vJSON
aData(i, 0) = "#" & sName
ToArrayElement vJSON(sName), ""
i = i + 1
Next
Else
ReDim aData(0 To 0, 0 To 0)
End If
Case Is >= vbArray
If UBound(vJSON) >= 0 Then
ReDim aData(0 To UBound(vJSON), 0 To 0)
For i = 0 To UBound(vJSON)
ToArrayElement vJSON(i), ""
Next
Else
ReDim aData(0 To 0, 0 To 0)
End If
Case Else
ReDim aData(0 To 0, 0 To 0)
aData(0, 0) = ToString(vJSON)
End Select
aHeader = oHeader.Keys()
Set oHeader = Nothing
aRows = aData
Erase aData
End Sub
Private Sub ToArrayElement(vElement As Variant, sFieldName As String)
Dim sName As Variant
Dim j As Long
Select Case VarType(vElement)
Case vbObject ' collection of objects
For Each sName In vElement
ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
Next
Case Is >= vbArray ' collection of arrays
For j = 0 To UBound(vElement)
ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
Next
Case Else
If Not oHeader.Exists(sFieldName) Then
oHeader(sFieldName) = oHeader.Count
If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
End If
j = oHeader(sFieldName)
aData(i, j) = ToString(vElement)
End Select
End Sub
Проверьте VBA-JSON-анализатор на GitHub для получения последней версии анализатора JSON (импортируйте модуль JSON.bas в проект VBA для обработки JSON).
Я бы посоветовал вам использовать Selenium с Python. Настройка займет некоторое время, но как только это будет сделано, у вас будет идеальный инструмент для работы. Это позволит вам очищать любой веб-сайт, который вам нужен, используя простой синтаксис Python, используя нужный веб-браузер (Firefox или Chrome), читая и взаимодействуя с javascript. Я использую его ежедневно.