尝试从网页上的图像中提取元数据时,一直返回 {},为什么?
When trying to extract meta data out of images on webpages, keeps returning {}, why?
我看过 exifread 文档,它说它作为字典返回,但问题是它 returns 除了 {} 什么都没有,我不知道那是否意味着没有图片中的元数据,或者我犯了一个愚蠢的错误,无论如何我花了很多时间查看我的代码和文档,但仍然找不到解决方案,任何帮助将不胜感激:)
代码:
import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys
def get_images(target):
colorama.init()
print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
req = urllib2.Request(target)
resp = urllib2.urlopen(req)
page = resp.read()
soup = bs4.BeautifulSoup(page, "html.parser")
for img in soup.find_all("img"):
src = img.get("src")
if "www" in src or "http" in src or "https" in src:
rand_num = random.random()
name = str(rand_num) + ".jpg"
urllib.urlretrieve(src, name)
f = open(name, "rb")
tags = exifread.process_file(f)
print (tags)
else:
s = target + src
rand_num = random.random()
name = str(rand_num) + ".jpg"
urllib.urlretrieve(s, name)
f = open(name, "rb")
tags = exifread.process_file(f)
print (tags)
return
def main():
target = raw_input("Enter the target: ")
print ("\n")
get_images(target)
time.sleep(5)
sys.exit()
if __name__ == "__main__":
main()
问题是您没有传递 base url,您需要传递主机然后将其加入 src 除非你从 src 属性中得到一个绝对值 url。
以下代码演示了一个工作示例,我使用 requests 代替 urllib 但逻辑是相同的:
import bs4
import sys
import os
import requests
from urlparse import urljoin
def get_images(target, base):
page = requests.get(target).content
soup = bs4.BeautifulSoup(page, "html.parser")
for img in soup.find_all("img", src=True):
src = img.get("src")
name = os.path.basename(src)
if not src.startswith(("www.","http:","https:")):
src = urljoin(base, src)
with open(name, "wb+") as f:
f.write(requests.get(src).content)
f.seek(0)
tags = exifread.process_file(f,"rb")
print (tags)
def main():
target ="http://www.exiv2.org/sample.html"
# need base to join to relative src
base = "http://www.exiv2.org/"
get_images(target, base)
if __name__ == "__main__":
main()
您将获得页面上包含以下内容的一张图片的 exif 数据:
PIL 示例:
import bs4
import os
import requests
from urlparse import urljoin
import PIL.Image
def get_images(target, base):
page = requests.get(target).content
soup = bs4.BeautifulSoup(page, "html.parser")
for img in soup.find_all("img"):
src = img.get("src")
name = os.path.basename(src)
if not src.startswith(("www.","http:","https:")):
src = urljoin(base, src)
with open(name, "wb+") as f:
f.write(requests.get(src).content)
f.seek(0)
try:
img = PIL.Image.open(f)
exif_data = img._getexif()
print(exif_data)
except AttributeError as e:
print("No exif data for {}".format(name))
os.remove(name)
os.remove(name)
将删除没有 exif 数据的文件,如果您不希望发生这种情况,请将其删除。
我看过 exifread 文档,它说它作为字典返回,但问题是它 returns 除了 {} 什么都没有,我不知道那是否意味着没有图片中的元数据,或者我犯了一个愚蠢的错误,无论如何我花了很多时间查看我的代码和文档,但仍然找不到解决方案,任何帮助将不胜感激:)
代码:
import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys
def get_images(target):
colorama.init()
print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
req = urllib2.Request(target)
resp = urllib2.urlopen(req)
page = resp.read()
soup = bs4.BeautifulSoup(page, "html.parser")
for img in soup.find_all("img"):
src = img.get("src")
if "www" in src or "http" in src or "https" in src:
rand_num = random.random()
name = str(rand_num) + ".jpg"
urllib.urlretrieve(src, name)
f = open(name, "rb")
tags = exifread.process_file(f)
print (tags)
else:
s = target + src
rand_num = random.random()
name = str(rand_num) + ".jpg"
urllib.urlretrieve(s, name)
f = open(name, "rb")
tags = exifread.process_file(f)
print (tags)
return
def main():
target = raw_input("Enter the target: ")
print ("\n")
get_images(target)
time.sleep(5)
sys.exit()
if __name__ == "__main__":
main()
问题是您没有传递 base url,您需要传递主机然后将其加入 src 除非你从 src 属性中得到一个绝对值 url。
以下代码演示了一个工作示例,我使用 requests 代替 urllib 但逻辑是相同的:
import bs4
import sys
import os
import requests
from urlparse import urljoin
def get_images(target, base):
page = requests.get(target).content
soup = bs4.BeautifulSoup(page, "html.parser")
for img in soup.find_all("img", src=True):
src = img.get("src")
name = os.path.basename(src)
if not src.startswith(("www.","http:","https:")):
src = urljoin(base, src)
with open(name, "wb+") as f:
f.write(requests.get(src).content)
f.seek(0)
tags = exifread.process_file(f,"rb")
print (tags)
def main():
target ="http://www.exiv2.org/sample.html"
# need base to join to relative src
base = "http://www.exiv2.org/"
get_images(target, base)
if __name__ == "__main__":
main()
您将获得页面上包含以下内容的一张图片的 exif 数据:
PIL 示例:
import bs4
import os
import requests
from urlparse import urljoin
import PIL.Image
def get_images(target, base):
page = requests.get(target).content
soup = bs4.BeautifulSoup(page, "html.parser")
for img in soup.find_all("img"):
src = img.get("src")
name = os.path.basename(src)
if not src.startswith(("www.","http:","https:")):
src = urljoin(base, src)
with open(name, "wb+") as f:
f.write(requests.get(src).content)
f.seek(0)
try:
img = PIL.Image.open(f)
exif_data = img._getexif()
print(exif_data)
except AttributeError as e:
print("No exif data for {}".format(name))
os.remove(name)
os.remove(name)
将删除没有 exif 数据的文件,如果您不希望发生这种情况,请将其删除。