bs4 无法从 worldometers 抓取中国的新冠数据

bs4 doesn't scrape corona data of china from worldometers

好吧,这个问题看起来很奇怪:当我从 worldometers 抓取新冠数据时,它似乎跳过了中国。中国排在蒙古和喀麦隆之间,但 cmd 中输出的字典里却没有它。谁能告诉我为什么会这样? link to website

import requests
from itertools import islice
from bs4 import BeautifulSoup as bs
# Download the Worldometers coronavirus page and parse the raw HTML exactly as
# the server sent it.
url = "https://www.worldometers.info/coronavirus/"
r = requests.get(url)
htmlcontent = r.content
soup = bs(htmlcontent, "html.parser")

# Country names come from the first 120 <a class="mt_a"> links on the page.
country = soup.find_all("a", class_="mt_a")[:120]

# Labels applied, in order, to the first 14 cells of every table row.
names = [
    "sno",
    'Country',
    'Totalcases',
    'NewCases',
    'TotalDeaths',
    'NewDeaths',
    'TotalRecovered',
    'NewRecovered',
    'ActiveCases',
    'Serious',
    'TotCases/1M pop',
    'Deaths/1M pop',
    'TotalTests',
    'Tests/1M pop',
]

# Only the first <tbody> in the document is read.
tbody = soup.find_all("tbody")[0]

# Flatten the table into one list of cell strings: the first 8 rows are
# skipped (presumably summary rows -- confirm against the page) and at most 14
# cells are kept per row. A cell whose .string is None is stored as "".
country_info = []
for row in tbody.find_all("tr")[8:]:
    for cell in row.find_all("td")[:14]:
        cell_text = cell.string
        country_info.append("" if cell_text is None else cell_text)

# Pair the n-th link with the n-th group of len(names) cells. The pairing is
# purely positional: nothing checks that the group really is that country's
# row.
row_width = len(names)
covid_info = {}
for ind, link in enumerate(country):
    start = ind * row_width
    covid_info[link.string] = dict(
        zip(names, country_info[start:start + row_width])
    )


# Show the "Tests/1M pop" value recorded for every scraped country name.
print({link.string: covid_info[link.string]["Tests/1M pop"] for link in country})

编辑:我把 country 切片中的上限改成了 220,之后中国被打印在最后,其余国家的顺序不变。虽然我的问题解决了,但我想知道为什么中国排在最后,而其他国家的顺序都正常。

# importing modules
import requests
from bs4 import BeautifulSoup
 
# URL for scraping data
url = 'https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/'

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as data.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# soup.find_all('td') collects every <td> element in the document, in
# document order; data_iterator walks them one at a time.
data_iterator = iter(soup.find_all('td'))

# Passing the same iterator to zip() four times consumes the cells in groups
# of four, read as: country, confirmed, deaths, continent. A trailing group
# with fewer than four cells is dropped.
# For 'confirmed' and 'deaths', remove the commas and convert to int.
data = [
    (
        country.text,
        int(confirmed.text.replace(',', '')),
        int(deaths.text.replace(',', '')),
        continent.text,
    )
    for country, confirmed, deaths, continent in zip(
        data_iterator, data_iterator, data_iterator, data_iterator
    )
]

# Sort the data by the number of confirmed cases, largest first
data.sort(key=lambda row: row[1], reverse=True)

根据 https://www.geeksforgeeks.org/scraping-covid-19-statistics-using-beautifulsoup/ 它在数据列中包含中国...

所以我认为您可以使用该代码并更改响应格式的类型。

要获取“Tot Cases”列下的所有数据,可以参考下面的示例:它使用 CSS 选择器(select)选中正确的标签,并把“countries”与“Tot Cases”正确地对应起来。

import requests
from bs4 import BeautifulSoup as bs

url = "https://www.worldometers.info/coronavirus/"
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() raises on an HTTP error response instead of letting
# an error page be parsed as if it were the statistics table.
r = requests.get(url, timeout=30)
r.raise_for_status()
htmlcontent = r.content
soup = bs(htmlcontent, "html.parser")

# Text of the 11th <td> (only those carrying a style attribute) in every row
# from the 5th onward of the table with id "main_table_countries_today".
# Empty cells are skipped.
# NOTE(review): because blanks are dropped here but names are not, a country
# with an empty cell would shift every later name/value pair -- confirm the
# column never has blanks for listed countries.
cases = (
    case.text
    for case in soup.select(
        "#main_table_countries_today tr:nth-of-type(n+5) td[style]:nth-of-type(11)"
    )
    if case.text != ""
)


# Country names: every <a class="mt_a"> link inside the table, plus the second
# cell of the row matched by tr.odd:nth-of-type(7) (presumably a country whose
# name is not wrapped in a link -- confirm against the page source).
countries = (
    country.text
    for country in soup.select(
        "#main_table_countries_today a.mt_a, #main_table_countries_today tr.odd:nth-of-type(7) td:nth-of-type(2)"
    )
)


# Names and values are paired purely by position.
print(dict(zip(countries, cases)))

输出:

{'USA': '103,348', 'India': '21,487', 'Brazil': '83,562', 'France': '88,015', 'Turkey': '63,021', 'Russia': '36,418', 'UK': '67,859', 'Argentina': '93,395', 'Italy': '70,442', 'Colombia': '76,205', 'Spain': '80,335', 'Germany': '44,379', 'Iran': '36,401', 'Poland': '76,145', 'Mexico': '19,009', 'Ukraine': '51,278', 'Peru': '60,655', 'Indonesia': '7,202', 'South Africa': '30,374', 'Netherlands': '97,777', 'Czechia': '155,294', 'Chile': '78,706', 'Canada': '37,005', 'Philippines': '12,245', 'Iraq': '31,333', 'Sweden': '106,755', 'Romania': '56,518', 'Belgium': '92,718', 'Pakistan': '4,214', 'Portugal': '85,076', 'Bangladesh': '5,122', 'Israel': '90,055', 'Hungary': '83,787', 'Japan': '6,217', 'Jordan': '72,468', 'Serbia': '82,239', 'Switzerland': '80,502', 'Malaysia': '21,254', 'Austria': '71,729', 'Nepal': '20,953', 'UAE': '61,169', 'Lebanon': '79,956', 'Morocco': '14,107', 'Saudi Arabia': '13,420', 'Ecuador': '24,939', 'Bulgaria': '61,035', 'Bolivia': '35,595', 'Greece': '40,327', 'Belarus': '43,475', 'Kazakhstan': '21,516', 'Paraguay': '56,128', 'Panama': '89,756', 'Slovakia': '71,642', 'Tunisia': '31,930', 'Croatia': '88,016', 'Georgia': '90,131', 'Uruguay': '101,389', 'Costa Rica': '68,229', 'Kuwait': '77,890', 'Azerbaijan': '32,804', 'Dominican Republic': '28,923', 'Palestine': '59,940', 'Denmark': '50,182', 'Guatemala': '15,353', 'Lithuania': '103,661', 'Egypt': '2,656', 'Ethiopia': '2,337', 'Ireland': '53,907', 'Bahrain': '149,260', 'Venezuela': '9,148', 'Slovenia': '123,596', 'Moldova': '63,639', 'Honduras': '25,171', 'Oman': '47,415', 'Sri Lanka': '11,148', 'Armenia': '75,533', 'Qatar': '78,638', 'Thailand': '3,118', 'Bosnia and Herzegovina': '62,806', 'Libya': '27,401', 'Kenya': '3,261', 'Cuba': '14,824', 'Nigeria': '792', 'North Macedonia': '74,701', 'S. 
Korea': '2,946', 'Myanmar': '2,703', 'Latvia': '73,317', 'Algeria': '3,039', 'Albania': '46,087', 'Estonia': '98,543', 'Norway': '23,667', 'Zambia': '6,835', 'Kyrgyzstan': '17,356', 'Uzbekistan': '3,124', 'Afghanistan': '2,613', 'Montenegro': '159,319', 'Ghana': '2,994', 'Finland': '17,000', 'Mongolia': '28,101', 'Cameroon': '2,955', 'El Salvador': '11,767', 'Cyprus': '60,520', 'Namibia': '28,301', 'Mozambique': '2,255', 'Maldives': '130,450', 'Uganda': '1,518', 'Luxembourg': '110,913', 'Botswana': '26,383', 'Singapore': '10,588', 'Jamaica': '16,703', 'Ivory Coast': '1,779', 'Cambodia': '2,521', 'Senegal': '2,470', 'Madagascar': '1,485', 'Zimbabwe': '2,762', 'DRC': '410', 'Angola': '1,111', 'Sudan': '811', 'Malawi': '1,777', 'Cabo Verde': '56,895', 'Malta': '69,100', 'Rwanda': '2,301', 'Trinidad and Tobago': '21,735', 'Australia': '1,176', 'Réunion': '31,549', 'French Guiana': '86,406', 'Syria': '1,400', 'Gabon': '10,924', 'Guinea': '1,740', 'Mauritania': '4,261', 'Suriname': '33,375', 'Mayotte': '69,461', 'Guyana': '24,010', 'French Polynesia': '67,132', 'Eswatini': '16,054', 'Haiti': '1,494', 'Papua New Guinea': '1,859', 'Guadeloupe': '41,860', 'Somalia': '910', 'Mali': '691', 'Seychelles': '142,750', 'Taiwan': '587', 'Andorra': '178,874', 'Togo': '1,620', 'Burkina Faso': '628', 'Tajikistan': '1,366', 'Vietnam': '134', 'Belize': '32,143', 'Curaçao': '74,766', 'Congo': '2,177', 'Bahamas': '30,980', 'Martinique': '32,476', 'Hong Kong': '1,573', 'Djibouti': '11,566', 'Aruba': '103,597', 'Lesotho': '5,133', 'South Sudan': '951', 'Timor-Leste': '6,485', 'Equatorial Guinea': '6,011', 'Benin': '655', 'Nicaragua': '1,149', 'CAR': '1,446', 'Yemen': '226', 'Iceland': '19,299', 'Gambia': '2,427', 'Niger': '218', 'Eritrea': '1,492', 'Saint Lucia': '28,254', 'Burundi': '419', 'San Marino': '149,706', 'Chad': '293', 'Sierra Leone': '578', 'Gibraltar': '128,292', 'Channel Islands': '23,522', 'Barbados': '14,045', 'Comoros': '4,404', 'Guinea-Bissau': '1,900', 'Liechtenstein': 
'79,144', 'Liberia': '548', 'New Zealand': '543', 'Sint Maarten': '59,292', 'Monaco': '64,490', 'Bermuda': '40,351', 'Turks and Caicos': '61,777', 'Sao Tome and Principe': '10,588', 'St. Vincent Grenadines': '19,698', 'Saint Martin': '54,273', 'Laos': '278', 'Fiji': '2,176', 'Bhutan': '2,467', 'Mauritius': '1,335', 'Caribbean Netherlands': '61,607', 'Isle of Man': '18,709', 'Antigua and Barbuda': '12,795', 'St. Barth': '101,454', 'Faeroe Islands': '15,517', 'Cayman Islands': '9,148', 'Tanzania': '8', 'Wallis and Futuna': '40,279', 'Saint Kitts and Nevis': '6,161', 'Brunei ': '571', 'British Virgin Islands': '8,151', 'Dominica': '2,647', 'Grenada': '1,424', 'New Caledonia': '448', 'Anguilla': '7,203', 'Falkland Islands': '17,613', 'Macao': '81', 'Greenland': '862', 'Vatican City': '33,624', 'Saint Pierre Miquelon': '4,508', 'Montserrat': '4,004', 'Solomon Islands': '28', 'Western Sahara': '16', 'Vanuatu': '13', 'Marshall Islands': '67', 'Samoa': '15', 'Saint Helena': '328', 'Micronesia': '9', 'China': '64'}

Edit: I changed the limit to 220 in the country slice, and now China is printed last while the rest stay in the same order. Although my problem is solved, I would like to know why China comes last while the others are in order.

中国排在最后,是因为 requests 不像浏览器那样运行 JavaScript:您得到的结果与原始 HTML 源码一致,而在源码中中国确实排在最后一行:

但是,在浏览器中,JavaScript 处于启用状态,页面会执行按总病例数降序排序的指令,因此中国所在的位置发生了变化:

因此,如果您希望结果按相同顺序排列,请确保按该列降序排列,如果包含第一列,请重新编号。

至于为什么中国位于 HTML 源码的最底部,您需要去问网站的开发者。可能是因为它最初被用作基准/比较对象(benchmark/comparator)。