1
0
mirror of https://e.coding.net/circlecloud/AliKeywordSearch.git synced 2024-11-16 00:48:59 +00:00
AliKeywordSearch/HttpHelper.cs
2015-07-27 14:36:42 +08:00

364 lines
14 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Net.Security;
using System.Net.Sockets;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
namespace EnAliKeywordSearch
{
/// <summary>
/// Describes a single HTTP request issued by <see cref="HttpHelper"/>.
/// </summary>
public class HttpArgs
{
    /// <summary>Request target written on the request line (path and query).</summary>
    public string Url { get; set; }

    /// <summary>Host name the socket connects to; also sent in the Host header.</summary>
    public string Host { get; set; }

    /// <summary>TCP port the socket connects to; also sent in the Host header.</summary>
    public int Port { get; set; }

    /// <summary>Value of the Accept header; the header is omitted when null or empty.</summary>
    public string Accept { get; set; }

    /// <summary>Value of the Referer header; the header is omitted when null or empty.</summary>
    public string Referer { get; set; }

    /// <summary>Value of the Cookie header; the header is omitted when null or empty.</summary>
    public string Cookie { get; set; }

    /// <summary>Request body appended after the headers for POST requests.</summary>
    public string Data { get; set; }

    /// <summary>Value of the User-Agent header.</summary>
    public string UA { get; set; }

    /// <summary>HTTP verb used to build the request line.</summary>
    public HttpMethod Method { get; set; }

    /// <summary>
    /// HTTP verbs understood by the request builder.
    /// </summary>
    public enum HttpMethod
    {
        GET,
        POST
    }
}
/// <summary>
/// Minimal plain-HTTP client built directly on a TCP <see cref="Socket"/>.
/// Sends an HTTP/1.1 request with "Connection: Close", reads until the server
/// closes the connection, then splits, de-chunks, decompresses and decodes the response.
/// </summary>
/// <remarks>
/// Only unencrypted HTTP is spoken. The scheme of the URL is not inspected, so an
/// https:// URL is still contacted in plain text on port 80 unless a port is given.
/// <see cref="State"/> and <see cref="ErrMsg"/> are static and therefore shared by
/// every instance of this class.
/// </remarks>
public class HttpHelper
{
    /// <summary>
    /// Status of the most recently parsed response: the numeric HTTP status code,
    /// except that 301 and 302 are both reported as 302. Initially 0.
    /// </summary>
    public static int State = 0;

    /// <summary>
    /// Message describing the most recent failure (invalid URL, socket error,
    /// malformed response, redirect problem). It is not cleared on success.
    /// </summary>
    public static string ErrMsg = string.Empty;

    // Upper bound on chained 301/302 responses so a redirect loop cannot recurse forever.
    private const int MaxRedirects = 10;
    private const int DefaultPort = 80;
    private const int SendTimeoutMs = 1000;
    private const int ReceiveTimeoutMs = 5000;
    private const string CrLf = "\r\n";

    private static readonly byte[] HeaderTerminator = { 13, 10, 13, 10 };
    private static readonly byte[] LineTerminator = { 13, 10 };

    private static readonly Regex CharsetRegex =
        new Regex(@"charset\s*=\s*[""']?([\w\-]+)", RegexOptions.IgnoreCase);
    private static readonly Regex MetaGbkRegex =
        new Regex(@"<meta[^>]+Content-Type[^>]+(?:gb2312|gbk)[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex MetaBig5Regex =
        new Regex(@"<meta[^>]+Content-Type[^>]+Big5[^>]*>", RegexOptions.IgnoreCase);

    #region Public API
    /// <summary>
    /// Performs an HTTP GET and returns the decoded response body.
    /// </summary>
    /// <param name="url">Absolute URL including the scheme, e.g. http://host:port/path?query.</param>
    /// <returns>
    /// The decoded body, or an empty string when the URL cannot be parsed, the request
    /// fails, or the server answers 400 or 404. See <see cref="State"/> and <see cref="ErrMsg"/>.
    /// </returns>
    public string Get(string url)
    {
        HttpArgs args = ParseURL(url);
        if (args == null)
        {
            // Report the problem the same way socket failures are reported
            // instead of dereferencing null.
            ErrMsg = "Invalid URL: " + url;
            return string.Empty;
        }

        args.Method = HttpArgs.HttpMethod.GET;
        return InternalSocketHttp(args, MaxRedirects);
    }

    /// <summary>
    /// Placeholder for an HTTP POST. Not implemented: no request is sent.
    /// </summary>
    /// <param name="url">Request URL (unused).</param>
    /// <param name="bytes">POST payload (unused).</param>
    /// <param name="cookies">Cookie store (unused).</param>
    /// <param name="encoding">Response encoding (unused).</param>
    /// <returns>Always null.</returns>
    public string Post(string url,
        byte[] bytes,
        CookieContainer cookies,
        Encoding encoding)
    {
        return null;
    }
    #endregion

    #region Socket
    /// <summary>
    /// Splits a URL into host, port and request target and fills in the default headers.
    /// </summary>
    /// <param name="strUrl">Absolute URL; a scheme followed by "//" is required.</param>
    /// <returns>The populated request description, or null when the URL is unusable.</returns>
    private HttpArgs ParseURL(string strUrl)
    {
        if (string.IsNullOrEmpty(strUrl))
        {
            return null;
        }

        string url = strUrl.Trim();
        int schemeEnd = url.IndexOf("//", StringComparison.Ordinal);
        if (schemeEnd <= 0)
        {
            return null;
        }

        // "www.alibaba.com:80/products/x.html" -> authority + request target
        string remainder = url.Substring(schemeEnd + 2);
        int authorityEnd = remainder.IndexOfAny(new char[] { '/', '?', '#' });
        string authority = authorityEnd >= 0 ? remainder.Substring(0, authorityEnd) : remainder;
        string target = authorityEnd >= 0 ? remainder.Substring(authorityEnd) : string.Empty;

        // The fragment is never sent to the server.
        int fragmentStart = target.IndexOf('#');
        if (fragmentStart >= 0)
        {
            target = target.Substring(0, fragmentStart);
        }
        // The request line needs a non-empty target beginning with '/'.
        if (target.Length == 0 || target[0] != '/')
        {
            target = "/" + target;
        }

        // Look for the port in the authority only, so a ':' in the path or query is harmless.
        int port = DefaultPort;
        int colon = authority.LastIndexOf(':');
        if (colon >= 0)
        {
            string portText = authority.Substring(colon + 1);
            authority = authority.Substring(0, colon);
            if (portText.Length > 0)
            {
                bool parsed = int.TryParse(
                    portText,
                    System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out port);
                if (!parsed || port < 1 || port > 65535)
                {
                    return null;
                }
            }
        }

        if (authority.Length == 0)
        {
            return null;
        }

        HttpArgs args = new HttpArgs();
        args.Host = authority;
        args.Port = port;
        args.Url = target;
        args.Referer = "";
        args.Cookie = "";
        args.Accept = "text/html";
        args.UA = "Mozilla/5.0+(Compatible;+Baiduspider/2.0;++http://www.baidu.com/search/spider.html)";
        return args;
    }

    /// <summary>
    /// Sends the request over a fresh socket, reads the whole response and parses it.
    /// </summary>
    /// <param name="args">Request description; updated in place when a redirect is followed.</param>
    /// <param name="redirectsLeft">How many further redirects may still be followed.</param>
    /// <returns>The decoded body, or an empty string on failure (the message goes to <see cref="ErrMsg"/>).</returns>
    private string InternalSocketHttp(HttpArgs args, int redirectsLeft)
    {
        try
        {
            byte[] responseBytes;
            using (Socket socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
            {
                socket.SendTimeout = SendTimeoutMs;
                socket.ReceiveTimeout = ReceiveTimeoutMs;
                socket.Connect(args.Host, args.Port);
                socket.Send(ParseHttpArgs(args));
                responseBytes = ReceiveAll(socket);
            }

            // The socket is closed before parsing, so a redirect does not keep it open.
            return ParseResponse(responseBytes, args, redirectsLeft);
        }
        catch (Exception e)
        {
            // Best effort by design: callers get an empty string and inspect ErrMsg.
            ErrMsg = e.Message;
            return string.Empty;
        }
    }

    /// <summary>
    /// Reads from the socket until the peer closes the connection.
    /// </summary>
    /// <param name="socket">Connected socket on which the request was already sent.</param>
    /// <returns>Exactly the bytes received.</returns>
    private static byte[] ReceiveAll(Socket socket)
    {
        using (MemoryStream received = new MemoryStream())
        {
            byte[] buffer = new byte[4096];
            int count;
            // "Connection: Close" was requested, so a zero-length read marks the end.
            while ((count = socket.Receive(buffer, buffer.Length, SocketFlags.None)) > 0)
            {
                // Store only the bytes actually received, never the whole buffer.
                received.Write(buffer, 0, count);
            }
            return received.ToArray();
        }
    }

    /// <summary>
    /// Separates headers from body, updates <see cref="State"/> and returns the decoded body.
    /// </summary>
    /// <param name="responseBytes">Raw response exactly as received.</param>
    /// <param name="args">Request that produced the response; reused for redirects.</param>
    /// <param name="redirectsLeft">How many further redirects may still be followed.</param>
    /// <returns>The decoded body, or an empty string for 400, 404, malformed or failed redirects.</returns>
    private string ParseResponse(byte[] responseBytes, HttpArgs args, int redirectsLeft)
    {
        // Split on the byte sequence CR LF CR LF; working on bytes keeps the body
        // intact for decompression and charset decoding.
        int headerEnd = IndexOf(responseBytes, HeaderTerminator, 0);
        if (headerEnd < 0)
        {
            return string.Empty;
        }

        string headers = Encoding.UTF8.GetString(responseBytes, 0, headerEnd);
        int bodyStart = headerEnd + HeaderTerminator.Length;
        byte[] body = new byte[responseBytes.Length - bodyStart];
        Buffer.BlockCopy(responseBytes, bodyStart, body, 0, body.Length);

        int status = ParseStatusCode(headers);
        switch (status)
        {
            case 0:
                ErrMsg = "Malformed HTTP status line";
                return string.Empty;
            case 400:
            case 404:
                State = status;
                return string.Empty;
            case 301:
            case 302:
                // Both redirect kinds are reported as 302.
                State = 302;
                return FollowRedirect(headers, args, redirectsLeft);
            default:
                State = status;
                return DecodeBody(body, headers);
        }
    }

    /// <summary>
    /// Re-issues the request against the address named by the Location header.
    /// </summary>
    /// <param name="headers">Header block of the redirect response.</param>
    /// <param name="args">Request to retarget; Host, Port and Url are overwritten.</param>
    /// <param name="redirectsLeft">How many further redirects may still be followed.</param>
    /// <returns>The body of the redirect target, or an empty string when it cannot be followed.</returns>
    private string FollowRedirect(string headers, HttpArgs args, int redirectsLeft)
    {
        string location = GetHeaderValue(headers, "Location");
        if (string.IsNullOrEmpty(location))
        {
            return string.Empty;
        }
        if (redirectsLeft <= 0)
        {
            ErrMsg = "Too many redirects";
            return string.Empty;
        }

        // Resolve absolute, scheme-relative and path-relative locations against the current request.
        string currentUrl = "http://" + args.Host + ":" + args.Port + args.Url;
        Uri current;
        Uri target;
        if (!Uri.TryCreate(currentUrl, UriKind.Absolute, out current)
            || !Uri.TryCreate(current, location, out target))
        {
            ErrMsg = "Invalid redirect location: " + location;
            return string.Empty;
        }
        if (target.Scheme != Uri.UriSchemeHttp)
        {
            // This client has no TLS support; fail fast instead of looping.
            ErrMsg = "Unsupported redirect scheme: " + target.Scheme;
            return string.Empty;
        }

        if (!string.Equals(args.Host, target.Host, StringComparison.OrdinalIgnoreCase))
        {
            // Do not forward cookies to a different host.
            args.Cookie = "";
        }
        args.Host = target.Host;
        args.Port = target.Port;
        args.Url = target.PathAndQuery;
        return InternalSocketHttp(args, redirectsLeft - 1);
    }
    #endregion

    #region Helper
    /// <summary>
    /// Finds the first occurrence of a byte sequence.
    /// </summary>
    /// <param name="haystack">Bytes to search.</param>
    /// <param name="needle">Sequence to look for.</param>
    /// <param name="start">Index at which the search begins.</param>
    /// <returns>Index of the first match, or -1 when there is none.</returns>
    private static int IndexOf(byte[] haystack, byte[] needle, int start)
    {
        for (int i = start; i <= haystack.Length - needle.Length; i++)
        {
            int matched = 0;
            while (matched < needle.Length && haystack[i + matched] == needle[matched])
            {
                matched++;
            }
            if (matched == needle.Length)
            {
                return i;
            }
        }
        return -1;
    }

    /// <summary>
    /// Extracts the numeric status code from the status line, for any HTTP version.
    /// </summary>
    /// <param name="headers">Header block starting with the status line.</param>
    /// <returns>The status code, or 0 when the status line is not recognisable.</returns>
    private static int ParseStatusCode(string headers)
    {
        int lineEnd = headers.IndexOf(CrLf, StringComparison.Ordinal);
        string statusLine = lineEnd >= 0 ? headers.Substring(0, lineEnd) : headers;
        string[] parts = statusLine.Split(new char[] { ' ' }, 3, StringSplitOptions.RemoveEmptyEntries);
        int code;
        if (parts.Length >= 2
            && parts[0].StartsWith("HTTP/", StringComparison.OrdinalIgnoreCase)
            && int.TryParse(
                parts[1],
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out code))
        {
            return code;
        }
        return 0;
    }

    /// <summary>
    /// Looks up a header by name, ignoring case.
    /// </summary>
    /// <param name="headers">Header block starting with the status line.</param>
    /// <param name="name">Header name without the colon.</param>
    /// <returns>The trimmed value of the first matching header, or null when absent.</returns>
    private static string GetHeaderValue(string headers, string name)
    {
        string[] lines = headers.Split(new string[] { CrLf }, StringSplitOptions.None);
        // Index 0 is the status line.
        for (int i = 1; i < lines.Length; i++)
        {
            int colon = lines[i].IndexOf(':');
            if (colon <= 0)
            {
                continue;
            }
            string candidate = lines[i].Substring(0, colon).Trim();
            if (string.Equals(candidate, name, StringComparison.OrdinalIgnoreCase))
            {
                return lines[i].Substring(colon + 1).Trim();
            }
        }
        return null;
    }

    /// <summary>
    /// Turns raw body bytes into text: de-chunk, decompress, then decode by charset.
    /// </summary>
    /// <param name="body">Body bytes exactly as received (headers excluded).</param>
    /// <param name="headers">Header block of the response.</param>
    /// <returns>The decoded body text.</returns>
    private string DecodeBody(byte[] body, string headers)
    {
        string transferEncoding = GetHeaderValue(headers, "Transfer-Encoding");
        if (transferEncoding != null
            && transferEncoding.IndexOf("chunked", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = DecodeChunkedBody(body);
        }

        DecompressWebPage(ref body, headers);
        string text = DecodeWebStringByHttpHeader(body, headers);
        return DecodeWebStringByHtmlPageInfo(body, text);
    }

    /// <summary>
    /// Removes chunked transfer coding framing. Chunk extensions and trailers are ignored.
    /// </summary>
    /// <param name="body">Chunk-encoded body bytes.</param>
    /// <returns>The concatenated chunk payloads; decoding stops at the first malformed chunk.</returns>
    private static byte[] DecodeChunkedBody(byte[] body)
    {
        using (MemoryStream decoded = new MemoryStream())
        {
            int position = 0;
            while (position < body.Length)
            {
                int lineEnd = IndexOf(body, LineTerminator, position);
                if (lineEnd < 0)
                {
                    break;
                }

                string sizeText = Encoding.ASCII.GetString(body, position, lineEnd - position);
                int extensionStart = sizeText.IndexOf(';');
                if (extensionStart >= 0)
                {
                    sizeText = sizeText.Substring(0, extensionStart);
                }

                int size;
                bool parsed = int.TryParse(
                    sizeText.Trim(),
                    System.Globalization.NumberStyles.HexNumber,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out size);
                if (!parsed || size <= 0)
                {
                    // Size 0 is the terminating chunk; anything unparsable ends decoding.
                    break;
                }

                position = lineEnd + LineTerminator.Length;
                int available = body.Length - position;
                if (size >= available)
                {
                    // Truncated final chunk: keep what arrived.
                    decoded.Write(body, position, available);
                    break;
                }

                decoded.Write(body, position, size);
                // Skip the payload and the CR LF that follows it.
                position += size + LineTerminator.Length;
            }
            return decoded.ToArray();
        }
    }

    /// <summary>
    /// Decompresses the body in place when the response declares gzip content encoding.
    /// </summary>
    /// <param name="responseBytes">Body bytes; replaced by the decompressed bytes when gzip is declared.</param>
    /// <param name="strHeader">Header block of the response.</param>
    private void DecompressWebPage(ref byte[] responseBytes, string strHeader)
    {
        string contentEncoding = GetHeaderValue(strHeader, "Content-Encoding");
        if (contentEncoding != null
            && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            responseBytes = Decompress(responseBytes);
        }
    }

    /// <summary>
    /// Inflates a gzip stream.
    /// </summary>
    /// <param name="szSource">Gzip-compressed bytes.</param>
    /// <returns>The decompressed bytes.</returns>
    private byte[] Decompress(byte[] szSource)
    {
        using (MemoryStream source = new MemoryStream(szSource))
        using (GZipStream gzip = new GZipStream(source, CompressionMode.Decompress))
        using (MemoryStream inflated = new MemoryStream())
        {
            byte[] buffer = new byte[4096];
            int count;
            while ((count = gzip.Read(buffer, 0, buffer.Length)) > 0)
            {
                inflated.Write(buffer, 0, count);
            }
            return inflated.ToArray();
        }
    }

    /// <summary>
    /// Returns the named encoding, or null when the runtime does not provide it.
    /// </summary>
    /// <param name="name">Encoding name such as "GBK".</param>
    /// <returns>The encoding, or null when the name is unknown or unsupported.</returns>
    private static Encoding ResolveEncoding(string name)
    {
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Decodes the body using the charset declared in the Content-Type header.
    /// </summary>
    /// <param name="responseBytes">Body bytes (headers excluded).</param>
    /// <param name="strHeader">Header block of the response.</param>
    /// <returns>The decoded text; UTF-8 is used when no usable charset is declared.</returns>
    private string DecodeWebStringByHttpHeader(byte[] responseBytes, string strHeader)
    {
        Encoding encoding = Encoding.UTF8;
        string contentType = GetHeaderValue(strHeader, "Content-Type");
        if (contentType != null)
        {
            Match charsetMatch = CharsetRegex.Match(contentType);
            if (charsetMatch.Success)
            {
                string charset = charsetMatch.Groups[1].Value;
                // gb2312 pages are decoded as GBK, as this class has always done.
                if (string.Equals(charset, "gb2312", StringComparison.OrdinalIgnoreCase))
                {
                    charset = "GBK";
                }
                encoding = ResolveEncoding(charset) ?? Encoding.UTF8;
            }
        }
        return encoding.GetString(responseBytes);
    }

    /// <summary>
    /// Re-decodes the body when an HTML meta Content-Type tag names gb2312, gbk or Big5.
    /// </summary>
    /// <param name="responseBytes">Body bytes (headers excluded).</param>
    /// <param name="strResponse">Body text as decoded so far.</param>
    /// <returns>The re-decoded text, or <paramref name="strResponse"/> unchanged.</returns>
    private string DecodeWebStringByHtmlPageInfo(byte[] responseBytes, string strResponse)
    {
        if (MetaGbkRegex.IsMatch(strResponse))
        {
            Encoding gbk = ResolveEncoding("GBK");
            if (gbk != null)
            {
                strResponse = gbk.GetString(responseBytes);
            }
        }
        if (MetaBig5Regex.IsMatch(strResponse))
        {
            Encoding big5 = ResolveEncoding("Big5");
            if (big5 != null)
            {
                strResponse = big5.GetString(responseBytes);
            }
        }
        return strResponse;
    }

    /// <summary>
    /// Serialises the request description into the bytes sent on the wire.
    /// </summary>
    /// <param name="args">Request description.</param>
    /// <returns>Request line, headers and (for POST) body, encoded with <see cref="Encoding.Default"/>.</returns>
    private byte[] ParseHttpArgs(HttpArgs args)
    {
        bool isPost = args.Method == HttpArgs.HttpMethod.POST;
        string target = string.IsNullOrEmpty(args.Url) ? "/" : args.Url;
        string payload = args.Data ?? string.Empty;

        // HTTP requires CR LF line endings regardless of the platform's newline.
        StringBuilder builder = new StringBuilder();
        builder.Append(isPost ? "POST " : "GET ").Append(target).Append(" HTTP/1.1").Append(CrLf);
        if (isPost)
        {
            builder.Append("Content-Type: application/x-www-form-urlencoded").Append(CrLf);
        }
        builder.Append("Host: ").Append(args.Host).Append(':').Append(args.Port).Append(CrLf);
        builder.Append("User-Agent: ").Append(args.UA).Append(CrLf);
        if (!string.IsNullOrEmpty(args.Referer))
        {
            builder.Append("Referer: ").Append(args.Referer).Append(CrLf);
        }
        // Ask the server to close the connection so the end of the response is detectable.
        builder.Append("Connection: Close").Append(CrLf);
        if (!string.IsNullOrEmpty(args.Accept))
        {
            builder.Append("Accept: ").Append(args.Accept).Append(CrLf);
        }
        if (!string.IsNullOrEmpty(args.Cookie))
        {
            builder.Append("Cookie: ").Append(args.Cookie).Append(CrLf);
        }
        if (isPost)
        {
            // The length is measured with the same encoding used for the final bytes.
            builder.Append("Content-Length: ").Append(Encoding.Default.GetByteCount(payload)).Append(CrLf);
        }
        builder.Append(CrLf);
        if (isPost)
        {
            builder.Append(payload);
        }
        return Encoding.Default.GetBytes(builder.ToString());
    }
    #endregion
}
/// <summary>
/// Process-wide millisecond timer: call <see cref="start"/> once, then
/// <see cref="getTimes"/> to read the milliseconds elapsed since that call.
/// </summary>
public class MilliTimer
{
    // Built once from components, so no culture-sensitive date parsing is involved.
    private static readonly DateTime UnixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

    // Timestamp captured by start(), in milliseconds since the epoch.
    private static double times { get; set; }

    /// <summary>
    /// Records the current time as the reference point for <see cref="getTimes"/>.
    /// </summary>
    public static void start()
    {
        times = getTotalMilliseconds();
    }

    /// <summary>
    /// Returns the milliseconds elapsed since the last call to <see cref="start"/>.
    /// </summary>
    /// <returns>Elapsed milliseconds; the full epoch offset when <see cref="start"/> was never called.</returns>
    public static double getTimes()
    {
        return getTotalMilliseconds() - times;
    }

    /// <summary>
    /// Returns the milliseconds elapsed since 1970-01-01T00:00:00 UTC.
    /// </summary>
    /// <returns>Milliseconds since the Unix epoch, measured on the UTC clock.</returns>
    public static double getTotalMilliseconds()
    {
        // UTC avoids jumps at daylight-saving and time-zone changes.
        return DateTime.UtcNow.Subtract(UnixEpoch).TotalMilliseconds;
    }
}
}