AliKeywordSearch/HttpHelper.cs

383 lines
15 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Net.Security;
using System.Net.Sockets;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
namespace CityCraft
{
public enum HttpMethod
{
GET,
POST
}
public class HttpArgs
{
public string Url { get; set; }
public string Host { get; set; }
public int Port { get; set; }
public string Accept { get; set; }
public string Referer { get; set; }
public string Cookie { get; set; }
public string Data { get; set; }
public string UA { get; set; }
public HttpMethod Method { get; set; }
}
public enum HttpReadyState
{
,
,
,
,
}
public class HttpHelper
{
public HttpReadyState readyState = HttpReadyState.;
public int Status = 0;
public string responseBody = "";
public string responseText = "";
public byte[] responseByte = null;
public HttpArgs args = new HttpArgs();
public string ErrMsg = string.Empty;
/// <summary>
/// 提交方法
/// </summary>
#region HttpWebRequest & HttpWebResponse
/// <summary>
/// Get方法
/// </summary>
/// <param name="geturl">请求地址</param>
/// <param name="cookieser">Cookies存储器</param>
/// <returns>请求返回的Stream</returns>
public void Send(HttpMethod method, string url)
{
readyState = HttpReadyState.;
ParseURL(url);
args.Method = method;
new Thread(new ThreadStart(ReciveData)).Start();
}
public void ReciveData()
{
responseBody = InternalSocketHttp();
readyState = HttpReadyState.;
}
/// <summary>
/// Post方法
/// </summary>
/// <param name="posturl">请求地址</param>
/// <param name="bytes">Post数据</param>
/// <param name="cookieser">Cllkies存储器</param>
/// <returns>请求返回的流</returns>
public string Post(string url,
byte[] bytes,
CookieContainer cookies,
Encoding encoding)
{
return null;
}
/// <summary>
/// 根据Url得到host
/// </summary>
/// <param name="strUrl">url字符串</param>
/// <returns>host字符串</returns>
private void ParseURL(string strUrl)
{
if (args == null)
args = new HttpArgs();
args.Host = "";
args.Port = 80;
args.Referer = "";
args.Cookie = "";
args.Url = "";
args.Accept = "text/html";//,application/xhtml+xml,application/xml,application/json;";
args.UA = "Mozilla/5.0+(Compatible;+Baiduspider/2.0;++http://www.baidu.com/search/spider.html)";
//http://www.alibaba.com/products/Egg_Laying_Block_Machine/1.html
int iIndex = strUrl.IndexOf(@"//");
if (iIndex <= 0)
args = null;
//www.alibaba.com:80/products/Egg_Laying_Block_Machine/1.html
string nohttpurl = strUrl.Substring(iIndex + 2);
string address = nohttpurl;
iIndex = nohttpurl.IndexOf(@"/");
if (iIndex > 0)
{
//www.alibaba.com:80
address = nohttpurl.Substring(0, iIndex);
args.Url = nohttpurl.Substring(iIndex);
}
iIndex = nohttpurl.IndexOf(@":");
if (iIndex > 0)
{
string[] tempargs = address.Trim().Split(char.Parse(":"));
args.Host = tempargs[0];
args.Port = int.Parse(tempargs[1]);
}
else
{
//www.alibaba.com:80
args.Host = address;
args.Port = 80;
}
}
#endregion
#region Socket
string InternalSocketHttp()
{
using (Socket socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
{
try
{
socket.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.SendTimeout, 1000);
socket.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.ReceiveTimeout, 5000);
socket.Connect(args.Host, args.Port);
if (socket.Connected)
{
byte[] buff = ParseHttpArgs();
if (socket.Send(buff) > 0)
{
List<byte> responseBytes = new List<byte>();
byte[] buffer = new byte[1024];
int iNumber = socket.Receive(buffer, buffer.Length, SocketFlags.None);
while (iNumber > 0)//使用了Connection: Close 所以判断长度为0 时停止接受
{
responseBytes.AddRange(new List<byte>(buffer));//添加数据到List
iNumber = socket.Receive(buffer, buffer.Length, SocketFlags.None);//继续接收数据
}
responseByte = responseBytes.ToArray();
readyState = HttpReadyState.;
return ParseResponse(responseByte);
}
}
}
catch (Exception e)
{
ErrMsg = e.Message;
}
return string.Empty;
}
}
private string ParseResponse(byte[] responseBytes)
{
string responseStr = Encoding.UTF8.GetString(responseBytes);
int splitindex = responseStr.IndexOf("\r\n\r\n");
if (splitindex > 0)
{
string responseHeader = responseStr.Substring(0, splitindex);
string responseBody = responseStr.Substring(splitindex + 4);
if (responseHeader.StartsWith("HTTP/1.1 400"))
{
Status = 400;
return string.Empty;
}
else if (responseHeader.StartsWith("HTTP/1.1 404"))
{
Status = 404;
return string.Empty;
}
else if (responseHeader.StartsWith("HTTP/1.1 302") || responseHeader.StartsWith("HTTP/1.1 301"))
{
Status = 302;
int start = responseHeader.ToUpper().IndexOf("LOCATION");
if (start > 0)
{
string temp = responseHeader.Substring(start, responseHeader.Length - start);
string[] sArry = Regex.Split(temp, "\r\n");
args.Url = sArry[0].Remove(0, 10);
if (args.Url == "")
return string.Empty;
return InternalSocketHttp(); //注意302协议需要重定向
}
}
else if (responseHeader.StartsWith("HTTP/1.1 200")) //读取内容
{
Status = 200;
//解压
DecompressWebPage(ref responseBytes, responseHeader);
//转码
responseBody = DecodeWebStringByHttpHeader(responseBytes, responseHeader);
responseBody = DecodeWebStringByHtmlPageInfo(responseBytes, responseBody);
}
splitindex = responseBody.IndexOf("\r\n\r\n");
if (splitindex > 0)
responseBody = responseBody.Substring(splitindex + 4);
else
responseBody = string.Empty;
return responseBody;
}
return string.Empty;
}
#endregion
#region Helper
/// <summary>
/// 解压网页
/// </summary>
/// <param name="responseBytes">网页字节数组含http头</param>
/// <param name="iTotalCount">数组长度</param>
/// <param name="strHeader">Http头字符串</param>
/// <param name="iStart">网页正文开始位置</param>
private void DecompressWebPage(ref byte[] responseBytes, string strHeader)
{
Regex regZip = new Regex(@"Content-Encoding:\s+gzip[^\n]*\r\n", RegexOptions.IgnoreCase);
if (regZip.IsMatch(strHeader))
{
responseBytes = Decompress(responseBytes);
}
}
/// <summary>
/// 解压gzip网页
/// </summary>
/// <param name="szSource">压缩过的字符串字节数组</param>
/// <returns>解压后的字节数组</returns>
private byte[] Decompress(byte[] szSource)
{
MemoryStream msSource = new MemoryStream(szSource);
//DeflateStream 也可以这儿
GZipStream stream = new GZipStream(msSource, CompressionMode.Decompress);
byte[] szTotal = new byte[40 * 1024];
long lTotal = 0;
byte[] buffer = new byte[8];
int iCount = 0;
do
{
iCount = stream.Read(buffer, 0, 8);
if (szTotal.Length <= lTotal + iCount) //放大数组
{
byte[] temp = new byte[szTotal.Length * 10];
szTotal.CopyTo(temp, 0);
szTotal = temp;
}
buffer.CopyTo(szTotal, lTotal);
lTotal += iCount;
} while (iCount != 0);
byte[] szDest = new byte[lTotal];
Array.Copy(szTotal, 0, szDest, 0, lTotal);
return szDest;
}
/// <summary>
/// 根据Http头标记里面的字符编码解析字符串
/// </summary>
/// <param name="responseBytes">网页内容字节数组(除http头以外的内容)</param>
/// <param name="iTotalCount">网页内容字节数组长度</param>
/// <param name="strHeader">http头的字符串</param>
/// <returns>转好的字符串</returns>
private string DecodeWebStringByHttpHeader(byte[] responseBytes, string strHeader)
{
string strResponse = "";
if (strHeader.Contains("charset=GBK") || strHeader.Contains("charset=gb2312"))
{
strResponse = Encoding.GetEncoding("GBK").GetString(responseBytes);
}
else
strResponse = Encoding.UTF8.GetString(responseBytes);
return strResponse;
}
/// <summary>
/// 根据网页meta标记里面的字符编码解析字符串
/// </summary>
/// <param name="responseBytes">网页内容字节数组(除http头以外的内容)</param>
/// <param name="iTotalCount">网页内容字节数组长度</param>
/// <param name="strResponse">网页内容字符串, 可能已经根据其它转码要求转换过的字符串</param>
/// <returns>转好的字符串</returns>
private string DecodeWebStringByHtmlPageInfo(byte[] responseBytes, string strResponse)
{
Regex regGB2312 = new Regex(@"<meta[^>]+Content-Type[^>]+gb2312[^>]*>", RegexOptions.IgnoreCase);
Regex regGBK = new Regex(@"<meta[^>]+Content-Type[^>]+gbk[^>]*>", RegexOptions.IgnoreCase);
Regex regBig5 = new Regex(@"<meta[^>]+Content-Type[^>]+Big5[^>]*>", RegexOptions.IgnoreCase);
if (regGB2312.IsMatch(strResponse) || regGBK.IsMatch(strResponse))
strResponse = Encoding.GetEncoding("GBK").GetString(responseBytes);
if (regBig5.IsMatch(strResponse))
strResponse = Encoding.GetEncoding("Big5").GetString(responseBytes);
return strResponse;
}
private byte[] ParseHttpArgs()
{
StringBuilder bulider = new StringBuilder();
if (args.Method == HttpMethod.POST)
{
bulider.AppendLine(string.Format("POST {0} HTTP/1.1", args.Url));
bulider.AppendLine("Content-Type: application/x-www-form-urlencoded");
}
else
{
bulider.AppendLine(string.Format("GET {0} HTTP/1.1", args.Url));
}
bulider.AppendLine(string.Format("Host: {0}:{1}", args.Host, args.Port));
bulider.AppendLine("User-Agent: " + args.UA);
//"User-Agent: Mozilla/5.0+(Compatible;+Baiduspider/2.0;++http://www.baidu.com/search/spider.html)";Mozilla/5.0 (Windows NT 6.1; IE 9.0)
if (!string.IsNullOrEmpty(args.Referer))
bulider.AppendLine(string.Format("Referer: {0}", args.Referer));
//bulider.AppendLine("Connection: close");
bulider.AppendLine("Connection: Close");
if (!string.IsNullOrEmpty(args.Accept))
bulider.AppendLine(string.Format("Accept: {0}", args.Accept));
if (!string.IsNullOrEmpty(args.Cookie))
bulider.AppendLine(string.Format("Cookie: {0}", args.Cookie));
if (args.Method == HttpMethod.POST)
{
bulider.AppendLine(string.Format("Content-Length: {0}\r\n", Encoding.Default.GetBytes(args.Data).Length));
bulider.Append(args.Data);
}
else
{
bulider.Append("\r\n");
}
string header = bulider.ToString();
return Encoding.Default.GetBytes(header);
}
#endregion
}
public class MilliTimer
{
private static double times { get; set; }
public static void start()
{
times = getTotalMilliseconds();
}
public static double getTimes()
{
return getTotalMilliseconds() - times;
}
public static double getTotalMilliseconds()
{
return DateTime.Now.Subtract(DateTime.Parse("1970-1-1")).TotalMilliseconds;
}
}
}