15
07/2014
C# 爬取网站的方法
方法1、用自带的 WebClient,代码如下:
/**
* Fetch a page with WebClient and print its body to the console.
*
* WebClient, the response stream and the reader are all IDisposable, so they
* are wrapped in using statements: the connection is released even when the
* download or the read throws.
*/
using (WebClient client = new WebClient())
{
    // Add a user agent header in case the
    // requested URI contains a query.
    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
    using (Stream data = client.OpenRead("http://www.baidu.com"))
    using (StreamReader reader = new StreamReader(data))
    {
        // Disposing the reader also disposes the underlying stream.
        string s = reader.ReadToEnd();
        Console.WriteLine(s);
    }
}
// Keep the console window open until a key is pressed.
Console.ReadKey();
return;
方法2、用 HttpWebRequest 爬取数据,代码如下:
/**
* Fetch a page with HttpWebRequest and decode it using the charset declared
* either in the Content-Type response header or inside the page itself.
*
* The body is buffered as raw bytes first and decoded exactly once, after the
* encoding is known. Decoding with a guessed encoding and re-encoding the
* resulting string is lossy, because bytes that are invalid in the guessed
* code page are replaced with '?'.
*/
string m_html = "";
int m_pagesize = 0;
HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create("http://www.baidu.com/");
myReq.Accept = @"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
myReq.Method = "GET";
myReq.Headers.Add(HttpRequestHeader.AcceptLanguage, @"zh-CN,zh;q=0.8");
myReq.UserAgent = @"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36";
myReq.KeepAlive = true;
// Disposing the response releases the connection on every exit path,
// including the early return and any exception thrown while reading.
using (HttpWebResponse rsps = (HttpWebResponse)myReq.GetResponse())
{
    string contenttype = rsps.ContentType ?? "";
    // Only handle textual responses that announce a size of at most 4 MiB (1 << 22 bytes).
    if (!contenttype.StartsWith("text/", StringComparison.OrdinalIgnoreCase) || rsps.ContentLength > 1 << 22)
    {
        return;
    }

    // Buffer the raw body so it can be inspected and then decoded without loss.
    byte[] bytes;
    using (Stream sm = rsps.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        sm.CopyTo(buffer);
        bytes = buffer.ToArray();
    }

    Encoding cding = Encoding.Default;
    int ix = contenttype.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
    if (ix != -1)
    {
        // Charset from the header: drop any trailing parameters and surrounding quotes,
        // e.g. text/html; charset="utf-8"; foo=bar  ->  utf-8
        string headerCharset = contenttype.Substring(ix + "charset=".Length);
        int end = headerCharset.IndexOf(';');
        if (end != -1)
        {
            headerCharset = headerCharset.Substring(0, end);
        }
        headerCharset = headerCharset.Trim(' ', '"', '\'');
        try
        {
            cding = Encoding.GetEncoding(headerCharset);
        }
        catch (ArgumentException)
        {
            // Unknown or malformed charset name: fall back to the system default.
            cding = Encoding.Default;
        }
        catch (NotSupportedException)
        {
            cding = Encoding.Default;
        }
        Console.WriteLine("{0}", cding);
    }
    else
    {
        // No charset in the header: look for a declaration inside the document
        // (e.g. <meta charset="utf-8">). The declaration itself is ASCII, so an
        // ASCII view of the bytes is enough to find it.
        string sniffed = Encoding.ASCII.GetString(bytes);
        // Capture only the charset token; stop at quotes, whitespace or tag ends.
        Regex regex = new Regex("charset\\s*=\\s*[\"']?(?<cding>[\\w.:-]+)", RegexOptions.IgnoreCase);
        Match match = regex.Match(sniffed);
        if (match.Success)
        {
            try
            {
                cding = Encoding.GetEncoding(match.Groups["cding"].Value);
            }
            catch (ArgumentException)
            {
                cding = Encoding.Default;
            }
            catch (NotSupportedException)
            {
                cding = Encoding.Default;
            }
        }
    }

    // Decode through StreamReader so that a byte order mark, if present,
    // is honoured and stripped just as it was when reading the stream directly.
    using (StreamReader reader = new StreamReader(new MemoryStream(bytes), cding))
    {
        m_html = reader.ReadToEnd();
    }
}
m_pagesize = m_html.Length;
Console.WriteLine("{0}", m_html);
// Keep the console window open until a key is pressed.
Console.ReadKey();
0 条评论