Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing.
That's wrong.
UEFI standard (version 2.3, May 2009, 5.3.1 GUID Format overview, page
95) defines that LBA is always based on the logical block size. It
means bdev_logical_block_size() (aka BLKSSZGET) for Linux.
This patch removes static sector size from EFI GPT parser.
The problem is reproducible with the latest GNU Parted:
# modprobe scsi_debug dev_size_mb=50 sector_size=4096
# ./parted /dev/sdb print
Model: Linux scsi_debug (scsi)
Disk /dev/sdb: 52.4MB
Sector size (logical/physical): 4096B/4096B
Partition Table: gpt
Number Start End Size File system Name Flags
1 24.6kB 3002kB 2978kB primary
2 3002kB 6001kB 2998kB primary
3 6001kB 9003kB 3002kB primary
# blockdev --rereadpt /dev/sdb
# dmesg | tail -1
sdb: unknown partition table <---- !!!
with this patch:
# blockdev --rereadpt /dev/sdb
# dmesg | tail -1
sdb: sdb1 sdb2 sdb3
Signed-off-by: Karel Zak <[email protected]>
---
fs/partitions/efi.c | 21 +++++++++++++--------
1 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a602..da3f576 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
/************************************************************
* EFI GUID Partition Table handling
- * Per Intel EFI Specification v1.02
- * http://developer.intel.com/technology/efi/efi.htm
+ *
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
* efi.[ch] by Matt Domsch <[email protected]>
* Copyright 2000,2001,2002,2004 Dell Inc.
*
@@ -141,7 +143,7 @@ last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
- return (bdev->bd_inode->i_size >> 9) - 1ULL;
+ return (bdev->bd_inode->i_size / bdev_logical_block_size(bdev)) - 1ULL;
}
static inline int
@@ -188,6 +190,7 @@ static size_t
read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
{
size_t totalreadcount = 0;
+ sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
if (!bdev || !buffer || lba > last_lba(bdev))
return 0;
@@ -195,7 +198,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
while (count) {
int copied = 512;
Sector sect;
- unsigned char *data = read_dev_sector(bdev, lba++, &sect);
+ unsigned char *data = read_dev_sector(bdev, n++, &sect);
if (!data)
break;
if (copied > count)
@@ -601,6 +604,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
+ unsigned ssz = bdev_logical_block_size(bdev) / 512;
if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@@ -611,13 +615,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+ u64 start = le64_to_cpu(ptes[i].starting_lba);
+ u64 size = le64_to_cpu(ptes[i].ending_lba) -
+ le64_to_cpu(ptes[i].starting_lba) + 1ULL;
+
if (!is_pte_valid(&ptes[i], last_lba(bdev)))
continue;
- put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
- (le64_to_cpu(ptes[i].ending_lba) -
- le64_to_cpu(ptes[i].starting_lba) +
- 1ULL));
+ put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid,
--
1.6.2.5
The size of EFI GPT header is not static, but whole sector is
allocated for the header. The HeaderSize field must be greater
than 92 (= sizeof(struct gpt_header)) and must be less than or
equal to the logical block size.
It means we have to read whole sector with the header, because the
header crc32 checksum is calculated according to HeaderSize.
For more details see UEFI standard (version 2.3, May 2009):
- 5.3.1 GUID Format overview, page 93
- Table 13. GUID Partition Table Header, page 96
Signed-off-by: Karel Zak <[email protected]>
---
fs/partitions/efi.c | 7 ++++---
fs/partitions/efi.h | 8 ++++++--
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index da3f576..58eca76 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -260,15 +260,16 @@ static gpt_header *
alloc_read_gpt_header(struct block_device *bdev, u64 lba)
{
gpt_header *gpt;
+ unsigned ssz = bdev_logical_block_size(bdev);
+
if (!bdev)
return NULL;
- gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
+ gpt = kzalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
- if (read_lba(bdev, lba, (u8 *) gpt,
- sizeof (gpt_header)) < sizeof (gpt_header)) {
+ if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt=NULL;
return NULL;
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0..6998b58 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
-#define GPT_BLOCK_SIZE 512
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
__le32 num_partition_entries;
__le32 sizeof_partition_entry;
__le32 partition_entry_array_crc32;
- u8 reserved2[GPT_BLOCK_SIZE - 92];
+
+ /* The rest of the logical block is reserved by UEFI and must be zero.
+ * EFI standard handles this by:
+ *
+ * uint8_t reserved2[ BlockSize - 92 ];
+ */
} __attribute__ ((packed)) gpt_header;
typedef struct _gpt_entry_attributes {
--
1.6.2.5
On Fri, Oct 23, 2009 at 12:36:15PM +0200, Karel Zak wrote:
> Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing.
> That's wrong.
Ping? Does anyone care about new disks with non-512byte sectors?
(or fs/partitions is unmaintained area? ;-)
Note that since 2.6.31 kernel properly reports topology information
to userspace and many userspace tools (fdisk, parted, anaconda,
libblkid, mkfs.xfs, mkfs.ext, ...) are able to follow such information.
The current kernel EFI GPT code is not compatible with the latest
userspace and GPT partitions on disks with >512byte sectors will be
*invisible* for Linux kernel.
Karel
> UEFI standard (version 2.3, May 2009, 5.3.1 GUID Format overview, page
> 95) defines that LBA is always based on the logical block size. It
> means bdev_logical_block_size() (aka BLKSSZGET) for Linux.
>
> This patch removes static sector size from EFI GPT parser.
>
> The problem is reproducible with the latest GNU Parted:
>
> # modprobe scsi_debug dev_size_mb=50 sector_size=4096
>
> # ./parted /dev/sdb print
> Model: Linux scsi_debug (scsi)
> Disk /dev/sdb: 52.4MB
> Sector size (logical/physical): 4096B/4096B
> Partition Table: gpt
>
> Number Start End Size File system Name Flags
> 1 24.6kB 3002kB 2978kB primary
> 2 3002kB 6001kB 2998kB primary
> 3 6001kB 9003kB 3002kB primary
>
> # blockdev --rereadpt /dev/sdb
> # dmesg | tail -1
> sdb: unknown partition table <---- !!!
>
> with this patch:
>
> # blockdev --rereadpt /dev/sdb
> # dmesg | tail -1
> sdb: sdb1 sdb2 sdb3
--
Karel Zak <[email protected]>
On Mon, Nov 09 2009, Karel Zak wrote:
> On Fri, Oct 23, 2009 at 12:36:15PM +0200, Karel Zak wrote:
> > Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing.
> > That's wrong.
>
> Ping? Does anyone care about new disks with non-512byte sectors?
> (or fs/partitions is unmaintained area? ;-)
Can you repost them, please? I don't seem to have them, most likely got
lost in the jls2009/ks travel schedule.
--
Jens Axboe
On Mon, Nov 09, 2009 at 02:08:27PM +0100, Karel Zak wrote:
> On Fri, Oct 23, 2009 at 12:36:15PM +0200, Karel Zak wrote:
> > Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing.
> > That's wrong.
>
> Ping? Does anyone care about new disks with non-512byte sectors?
> (or fs/partitions is unmaintained area? ;-)
[as you must have noticed, I hardly do any kernel work any more;
probably there aren't many who know more about the ugly details
of DOS-type partition tables, but on the other hand nobody needs
such knowledge either]
> The current kernel EFI GPT code is not compatible with the latest
> userspace and GPT partitions on disks with >512byte sectors will be
> *invisible* for Linux kernel.
Yes, I see that the current UEFI standard requires the use of the
disks block size. Roughly speaking I agree with your patch.
(Just read some current kernel code. The old hardsect_size stuff was
renamed to logical_block_size - funny, originally that was precisely
what hardsect was not.)
/**
 * read_lba() - read bytes from a block device, starting at a given LBA
 * @bdev:   block device to read from
 * @lba:    logical block address, counted in logical blocks of @bdev
 * @buffer: destination buffer; must have room for @count bytes
 * @count:  number of bytes to read
 *
 * @lba is scaled by (logical block size / 512) to obtain the index handed
 * to read_dev_sector(); the data is then copied out in chunks of at most
 * 512 bytes, one read_dev_sector()/put_dev_sector() pair per chunk.
 *
 * Return: number of bytes copied into @buffer. This is 0 when @bdev or
 * @buffer is NULL or @lba is greater than last_lba(@bdev), and less than
 * @count when read_dev_sector() fails part-way through.
 */
static size_t
read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
{
	size_t totalreadcount = 0;
	sector_t n;

	if (!bdev || !buffer || lba > last_lba(bdev))
		return 0;

	/*
	 * Compute the start sector only after the guard above: doing it in
	 * the declaration would hand a possibly-NULL bdev to
	 * bdev_logical_block_size() before the NULL check has run.
	 */
	n = lba * (bdev_logical_block_size(bdev) / 512);

	while (count) {
		int copied = 512;
		Sector sect;
		unsigned char *data = read_dev_sector(bdev, n++, &sect);

		if (!data)
			break;
		/* Last chunk may be shorter than a full 512 bytes. */
		if (copied > count)
			copied = count;
		memcpy(buffer, data, copied);
		put_dev_sector(sect);
		buffer += copied;
		totalreadcount += copied;
		count -= copied;
	}
	return totalreadcount;
}
Ugly - it looks as if you call read_dev_sector 8 times and each time
do a put_dev_sector afterwards to forget it again. Doesn't that mean
that in order to read a 4096-byte sector the kernel goes to the hardware
8 times?
Andries
On Mon, Nov 09, 2009 at 08:58:32PM +0100, Andries E. Brouwer wrote:
> static size_t
> read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
> {
> size_t totalreadcount = 0;
> sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
>
> if (!bdev || !buffer || lba > last_lba(bdev))
> return 0;
>
> while (count) {
> int copied = 512;
> Sector sect;
> unsigned char *data = read_dev_sector(bdev, n++, &sect);
> if (!data)
> break;
> if (copied > count)
> copied = count;
> memcpy(buffer, data, copied);
> put_dev_sector(sect);
> buffer += copied;
> totalreadcount +=copied;
> count -= copied;
> }
> return totalreadcount;
> }
>
> Ugly - it looks as if you call read_dev_sector 8 times and each time
Yes, the function is really not elegant, but it follows the way how
all fs/partitions code reads sectors from disks.
> do a put_dev_sector afterwards to forget it again. Doesn't that mean
> that in order to read a 4096-byte sector the kernel goes to the hardware
> 8 times?
read_dev_sector() works with 512-byte sectors and it is an interface to
read_mapping_page(), which works with pages from the cache.
Karel
--
Karel Zak <[email protected]>