最近在尝试学习网抓,遇到了待汉字的链接出错的问题。通过浏览器直接输入带汉字的链接是可以解析出来并返回正确页面的,而通过程序却总是返回找不到页面的错误。因而怀疑是汉字编码转码的问题。通过shell命令使用chrome可以打开,但shell IE浏览器却无法打开。使用XMLHTTP、internetexplorer、Webbrowser控件均是乱码。可以基本确定是发送的汉字没有正确转码的问题了。VB发送的字符串是UTF-8,而页面接受的最终数据是GB2312。附上转化代码。
Function gb2utf(szInput)
Dim wch, uch, szRet
Dim x
Dim nAsc, nAsc2, nAsc3
If szInput = "" Then
unic = szInput
Exit Function
End If
For x = 1 To Len(szInput)
wch = Mid(szInput, x, 1)
nAsc = AscW(wch)
If nAsc < 0 Then nAsc = nAsc + 65536
If (nAsc And &HFF80) = 0 Then
szRet = szRet & wch
Else
If (nAsc And &HF000) = 0 Then
uch = "%" & Hex(((nAsc \ 2 ^ 6)) Or &HC0) & Hex(nAsc And &H3F Or &H80)
szRet = szRet & uch
Else
uch = "%" & Hex((nAsc \ 2 ^ 12) Or &HE0) & "%" & _
Hex((nAsc \ 2 ^ 6) And &H3F Or &H80) & "%" & _
Hex(nAsc And &H3F Or &H80)
szRet = szRet & uch
End If
End If
Next
gb2utf = szRet
End Function
Function utf2gb(UTFStr)
For Dig = 1 To Len(UTFStr)
If Mid(UTFStr, Dig, 1) = "%" Then
If Len(UTFStr) >= Dig + 8 Then
GBStr = GBStr & ConvChinese(Mid(UTFStr, Dig, 9))
Dig = Dig + 8
Else
GBStr = GBStr & Mid(UTFStr, Dig, 1)
End If
Else
GBStr = GBStr & Mid(UTFStr, Dig, 1)
End If
Next
utf2gb = GBStr
End Function
Function ConvChinese(x)
A = Split(Mid(x, 2), "%")
i = 0
j = 0
For i = 0 To UBound(A)
A(i) = c16to2(A(i))
Next
For i = 0 To UBound(A) - 1
DigS = InStr(A(i), "0")
Unicode = ""
For j = 1 To DigS - 1
If j = 1 Then
A(i) = Right(A(i), Len(A(i)) - DigS)
Unicode = Unicode & A(i)
Else
i = i + 1
A(i) = Right(A(i), Len(A(i)) - 2)
Unicode = Unicode & A(i)
End If
Next
If Len(c2to16(Unicode)) = 4 Then
ConvChinese = ConvChinese & ChrW(Int("&H" & c2to16(Unicode)))
Else
ConvChinese = ConvChinese & Chr(Int("&H" & c2to16(Unicode)))
End If
Next
End Function
Function c2to16(x)
i = 1
For i = 1 To Len(x) Step 4
c2to16 = c2to16 & Hex(c2to10(Mid(x, i, 4)))
Next
End Function
Function c2to10(x)
c2to10 = 0
If x = "0" Then Exit Function
i = 0
For i = 0 To Len(x) - 1
If Mid(x, Len(x) - i, 1) = "1" Then c2to10 = c2to10 + 2 ^ (i)
Next
End Function
Function c16to2(x)
i = 0
For i = 1 To Len(Trim(x))
tempstr = c10to2(CInt(Int("&h" & Mid(x, i, 1))))
Do While Len(tempstr) < 4
tempstr = "0" & tempstr
Loop
c16to2 = c16to2 & tempstr
Next
End Function
Function c10to2(x)
mysign = Sgn(x)
x = Abs(x)
DigS = 1
Do
If x < 2 ^ DigS Then
Exit Do
Else
DigS = DigS + 1
End If
Loop
tempnum = x
i = 0
For i = DigS To 1 Step -1
If tempnum >= 2 ^ (i - 1) Then
tempnum = tempnum - 2 ^ (i - 1)
c10to2 = c10to2 & "1"
Else
c10to2 = c10to2 & "0"
End If
Next
If mysign = -1 Then c10to2 = "-" & c10to2
End Function
然后解决页面返回汉字乱码的问题。通过简单的strResponse = StrConv(xmlHttp.responseBody, vbUnicode)无法正确解码。于是只能手动转了,附上代码。页面是GB2312需要转换成UTF-8。
Private Sub testCode()
'测试
Debug.Print GetHtml("http://www.baidu.com/")
End Sub
Public Function GetHtml(url As String)
Dim xmlHttp As Object
Set xmlHttp = CreateObject("Microsoft.XMLHTTP")'
xmlHttp.Open "GET", url, True
' xmlHttp.Open "POST", url, True
' xmlHttp.setRequestHeader "Content-Type", "application/x-www-form-urlencoded;charset=utf-8"
xmlHttp.send (Null)
While xmlHttp.ReadyState <> 4
DoEvents
Wend
GetHtml = BytesToBstr(xmlHttp.responseBody)
End Function
Public Function PostHtml(url As String, val As String)
Dim xmlHttp As Object
Set xmlHttp = CreateObject("Microsoft.XMLHTTP")
' xmlHttp.Open "GET", url, True
xmlHttp.Open "POST", url, True
xmlHttp.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
xmlHttp.send (val)
While xmlHttp.ReadyState <> 4
DoEvents
Wend
PostHtml = BytesToBstr(xmlHttp.responseBody)
End Function
Public Function BytesToBstr(Bytes)
Dim Unicode As String
If IsUTF8(Bytes) Then '如果不是UTF-8编码则按照GB2312来处理
Unicode = "UTF-8"
Else
Unicode = "GB2312"
End If
Dim objstream As Object
Set objstream = CreateObject("ADODB.Stream")
With objstream
.Type = 1
.Mode = 3
.Open
.Write Bytes
.Position = 0
.Type = 2
.Charset = Unicode
BytesToBstr = .ReadText
.Close
End With
End Function
'判断网页编码函数
Private Function IsUTF8(Bytes) As Boolean
Dim i As Long, AscN As Long, Length As Long
Length = UBound(Bytes) + 1
If Length < 3 Then
IsUTF8 = False
Exit Function
ElseIf Bytes(0) = &HEF And Bytes(1) = &HBB And Bytes(2) = &HBF Then
IsUTF8 = True
Exit Function
End If
Do While i <= Length - 1
If Bytes(i) < 128 Then
i = i + 1
AscN = AscN + 1
ElseIf (Bytes(i) And &HE0) = &HC0 And (Bytes(i + 1) And &HC0) = &H80 Then
i = i + 2
ElseIf i + 2 < Length Then
If (Bytes(i) And &HF0) = &HE0 And (Bytes(i + 1) And &HC0) = &H80 And (Bytes(i + 2) And &HC0) = &H80 Then
i = i + 3
Else
IsUTF8 = False
Exit Function
End If
Else
IsUTF8 = False
Exit Function
End If
Loop
If AscN = Length Then
IsUTF8 = False
Else
IsUTF8 = True
End If
End Function