summaryrefslogtreecommitdiff
path: root/drivers/mfd/htc-i2cpld.c
blob: ebb9cf19e347604f2ac57af7d412aa42ede83b48 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
/*
 *  htc-i2cpld.c
 *  Chip driver for an unknown CPLD chip found on omap850 HTC devices like
 *  the HTC Wizard and HTC Herald.
 *  The cpld is located on the i2c bus and acts as an input/output GPIO
 *  extender.
 *
 *  Copyright (C) 2009 Cory Maccarrone <darkstar6262@gmail.com>
 *
 *  Based on work done in the linwizard project
 *  Copyright (C) 2008-2009 Angelo Arrifano <miknix@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/i2c.h>
#include <linux/irq.h>
#include <linux/spinlock.h>
#include <linux/htcpld.h>
#include <linux/gpio.h>
#include <linux/slab.h>

struct htcpld_chip {
	spinlock_t              lock;

	/* chip info */
	u8                      reset;
	u8                      addr;
	struct device           *dev;
	struct i2c_client	*client;

	/* Output details */
	u8                      cache_out;
	struct gpio_chip        chip_out;

	/* Input details */
	u8                      cache_in;
	struct gpio_chip        chip_in;

	u16                     irqs_enabled;
	uint                    irq_start;
	int                     nirqs;

	unsigned int		flow_type;
	/*
	 * Work structure to allow for setting values outside of any
	 * possible interrupt context
	 */
	struct work_struct set_val_work;
};

struct htcpld_data {
	/* irq info */
	u16                irqs_enabled;
	uint               irq_start;
	int                nirqs;
	uint               chained_irq;
	unsigned int       int_reset_gpio_hi;
	unsigned int       int_reset_gpio_lo;

	/* htcpld info */
	struct htcpld_chip *chip;
	unsigned int       nchips;
};

/* There does not appear to be a way to proactively mask interrupts
 * on the htcpld chip itself.  So, we simply ignore interrupts that
 * aren't desired. */
static void htcpld_mask(struct irq_data *data)
{
	struct htcpld_chip *chip = irq_data_get_irq_chip_data(data);
	chip->irqs_enabled &= ~(1 << (data->irq - chip->irq_start));
	pr_debug("HTCPLD mask %d %04x\n", data->irq, chip->irqs_enabled);
}
static void htcpld_unmask(struct irq_data *data)
{
	struct htcpld_chip *chip = irq_data_get_irq_chip_data(data);
	chip->irqs_enabled |= 1 << (data->irq - chip->irq_start);
	pr_debug("HTCPLD unmask %d %04x\n", data->irq, chip->irqs_enabled);
}

static int htcpld_set_type(struct irq_data *data, unsigned int flags)
{
	struct htcpld_chip *chip = irq_data_get_irq_chip_data(data);

	if (flags & ~IRQ_TYPE_SENSE_MASK)
		return -EINVAL;

	/* We only allow edge triggering */
	if (flags & (IRQ_TYPE_LEVEL_LOW|IRQ_TYPE_LEVEL_HIGH))
		return -EINVAL;

	chip->flow_type = flags;
	return 0;
}

static struct irq_chip htcpld_muxed_chip = {
	.name         = "htcpld",
	.irq_mask     = htcpld_mask,
	.irq_unmask   = htcpld_unmask,
	.irq_set_type = htcpld_set_type,
};

/* To properly dispatch IRQ events, we need to read from the
 * chip.  This is an I2C action that could possibly sleep
 * (which is bad in interrupt context) -- so we use a threaded
 * interrupt handler to get around that.
 */
static irqreturn_t htcpld_handler(int irq, void *dev)
{
	struct htcpld_data *htcpld = dev;
	unsigned int i;
	unsigned long flags;
	int irqpin;

	if (!htcpld) {
		pr_debug("htcpld is null in ISR\n");
		return IRQ_HANDLED;
	}

	/*
	 * For each chip, do a read of the chip and trigger any interrupts
	 * desired.  The interrupts will be triggered from LSB to MSB (i.e.
	 * bit 0 first, then bit 1, etc.)
	 *
	 * For chips that have no interrupt range specified, just skip 'em.
	 */
	for (i = 0; i < htcpld->nchips; i++) {
		struct htcpld_chip *chip = &htcpld->chip[i];
		struct i2c_client *client;
		int val;
		unsigned long uval, old_val;

		if (!chip) {
			pr_debug("chip %d is null in ISR\n", i);
			continue;
		}

		if (chip->nirqs == 0)
			continue;

		client = chip->client;
		if (!client) {
			pr_debug("client %d is null in ISR\n", i);
			continue;
		}

		/* Scan the chip */
		val = i2c_smbus_read_byte_data(client, chip->cache_out);
		if (val < 0) {
			/* Throw a warning and skip this chip */
			dev_warn(chip->dev, "Unable to read from chip: %d\n",
				 val);
			continue;
		}

		uval = (unsigned long)val;

		spin_lock_irqsave(&chip->lock, flags);

		/* Save away the old value so we can compare it */
		old_val = chip->cache_in;

		/* Write the new value */
		chip->cache_in = uval;

		spin_unlock_irqrestore(&chip->lock, flags);

		/*
		 * For each bit in the data (starting at bit 0), trigger
		 * associated interrupts.
		 */
		for (irqpin = 0; irqpin < chip->nirqs; irqpin++) {
			unsigned oldb, newb, type = chip->flow_type;

			irq = chip->irq_start + irqpin;

			/* Run the IRQ handler, but only if the bit value
			 * changed, and the proper flags are set */
			oldb = (old_val >> irqpin) & 1;
			newb = (uval >> irqpin) & 1;

			if ((!oldb && newb && (type & IRQ_TYPE_EDGE_RISING)) ||
			    (oldb && !newb && (type & IRQ_TYPE_EDGE_FALLING))) {
				pr_debug("fire IRQ %d\n", irqpin);
				generic_handle_irq(irq);
			}
		}
	}

	/*
	 * In order to continue receiving interrupts, the int_reset_gpio must
	 * be asserted.
	 */
	if (htcpld->int_reset_gpio_hi)
		gpio_set_value(htcpld->int_reset_gpio_hi, 1);
	if (htcpld->int_reset_gpio_lo)
		gpio_set_value(htcpld->int_reset_gpio_lo, 0);

	return IRQ_HANDLED;
}

/*
 * The GPIO set routines can be called from interrupt context, especially if,
 * for example they're attached to the led-gpio framework and a trigger is
 * enabled.  As such, we declared work above in the htcpld_chip structure,
 * and that work is scheduled in the set routine.  The kernel can then run
 * the I2C functions, which will sleep, in process context.
 */
static void htcpld_chip_set(struct gpio_chip *chip, unsigned offset, int val)
{
	struct i2c_client *client;
	struct htcpld_chip *chip_data =
		container_of(chip, struct htcpld_chip, chip_out);
	unsigned long flags;

	client = chip_data->client;
	if (!client)
		return;

	spin_lock_irqsave(&chip_data->lock, flags);
	if (val)
		chip_data->cache_out |= (1 << offset);
	else
		chip_data->cache_out &= ~(1 << offset);
	spin_unlock_irqrestore(&chip_data->lock, flags);

	schedule_work(&(chip_data->set_val_work));
}

static void htcpld_chip_set_ni(struct work_struct *work)
{
	struct htcpld_chip *chip_data;
	struct i2c_client *client;

	chip_data = container_of(work, struct htcpld_chip, set_val_work);
	client = chip_data->client;
	i2c_smbus_read_byte_data(client, chip_data->cache_out);
}

static int htcpld_chip_get(struct gpio_chip *chip, unsigned offset)
{
	struct htcpld_chip *chip_data;
	u8 cache;

	if (!strncmp(chip->label, "htcpld-out", 10)) {
		chip_data = container_of(chip, struct htcpld_chip, chip_out);
		cache = chip_data->cache_out;
	} else if (!strncmp(chip->label, "htcpld-in", 9)) {
		chip_data = container_of(chip, struct htcpld_chip, chip_in);
		cache = chip_data->cache_in;
	} else
		return -EINVAL;

	return (cache >> offset) & 1;
}

static int htcpld_direction_output(struct gpio_chip *chip,
					unsigned offset, int value)
{
	htcpld_chip_set(chip, offset, value);
	return 0;
}

static int htcpld_direction_input(struct gpio_chip *chip,
					unsigned offset)
{
	/*
	 * No-op: this function can only be called on the input chip.
	 * We do however make sure the offset is within range.
	 */
	return (offset < chip->ngpio) ? 0 : -EINVAL;
}

static int htcpld_chip_to_irq(struct gpio_chip *chip, unsigned offset)
{
	struct htcpld_chip *chip_data;

	chip_data = container_of(chip, struct htcpld_chip, chip_in);

	if (offset < chip_data->nirqs)
		return chip_data->irq_start + offset;
	else
		return -EINVAL;
}

static void htcpld_chip_reset(struct i2c_client *client)
{
	struct htcpld_chip *chip_data = i2c_get_clientdata(client);
	if (!chip_data)
		return;

	i2c_smbus_read_byte_data(
		client, (chip_data->cache_out = chip_data->reset));
}

static int htcpld_setup_chip_irq(
		struct platform_device *pdev,
		int chip_index)
{
	struct htcpld_data *htcpld;
	struct htcpld_chip *chip;
	unsigned int irq, irq_end;
	int ret = 0;

	/* Get the platform and driver data */
	htcpld = platform_get_drvdata(pdev);
	chip = &htcpld->chip[chip_index];

	/* Setup irq handlers */
	irq_end = chip->irq_start + chip->nirqs;
	for (irq = chip->irq_start; irq < irq_end; irq++) {
		irq_set_chip_and_handler(irq, &htcpld_muxed_chip,
					 handle_simple_irq);
		irq_set_chip_data(irq, chip);
#ifdef CONFIG_ARM
		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
#else
		irq_set_probe(irq);
#endif
	}

	return ret;
}

static int htcpld_register_chip_i2c(
		struct platform_device *pdev,
		int chip_index)
{
	struct htcpld_data *htcpld;
	struct device *dev = &pdev->dev;
	struct htcpld_core_platform_data *pdata;
	struct htcpld_chip *chip;
	struct htcpld_chip_platform_data *plat_chip_data;
	struct i2c_adapter *adapter;
	struct i2c_client *client;
	struct i2c_board_info info;

	/* Get the platform and driver data */
	pdata = dev_get_platdata(dev);
	htcpld = platform_get_drvdata(pdev);
	chip = &htcpld->chip[chip_index];
	plat_chip_data = &pdata->chip[chip_index];

	adapter = i2c_get_adapter(pdata->i2c_adapter_id);
	if (!adapter) {
		/* Eek, no such I2C adapter!  Bail out. */
		dev_warn(dev, "Chip at i2c address 0x%x: Invalid i2c adapter %d\n",
			 plat_chip_data->addr, pdata->i2c_adapter_id);
		return -ENODEV;
	}

	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_READ_BYTE_DATA)) {
		dev_warn(dev, "i2c adapter %d non-functional\n",
			 pdata->i2c_adapter_id);
		return -EINVAL;
	}

	memset(&info, 0, sizeof(struct i2c_board_info));
	info.addr = plat_chip_data->addr;
	strlcpy(info.type, "htcpld-chip", I2C_NAME_SIZE);
	info.platform_data = chip;

	/* Add the I2C device.  This calls the probe() function. */
	client = i2c_new_device(adapter, &info);
	if (!client) {
		/* I2C device registration failed, contineu with the next */
		dev_warn(dev, "Unable to add I2C device for 0x%x\n",
			 plat_chip_data->addr);
		return -ENODEV;
	}

	i2c_set_clientdata(client, chip);
	snprintf(client->name, I2C_NAME_SIZE, "Chip_0x%x", client->addr);
	chip->client = client;

	/* Reset the chip */
	htcpld_chip_reset(client);
	chip->cache_in = i2c_smbus_read_byte_data(client, chip->cache_out);

	return 0;
}

static void htcpld_unregister_chip_i2c(
		struct platform_device *pdev,
		int chip_index)
{
	struct htcpld_data *htcpld;
	struct htcpld_chip *chip;

	/* Get the platform and driver data */
	htcpld = platform_get_drvdata(pdev);
	chip = &htcpld->chip[chip_index];

	if (chip->client)
		i2c_unregister_device(chip->client);
}

static int htcpld_register_chip_gpio(
		struct platform_device *pdev,
		int chip_index)
{
	struct htcpld_data *htcpld;
	struct device *dev = &pdev->dev;
	struct htcpld_core_platform_data *pdata;
	struct htcpld_chip *chip;
	struct htcpld_chip_platform_data *plat_chip_data;
	struct gpio_chip *gpio_chip;
	int ret = 0;

	/* Get the platform and driver data */
	pdata = dev_get_platdata(dev);
	htcpld = platform_get_drvdata(pdev);
	chip = &htcpld->chip[chip_index];
	plat_chip_data = &pdata->chip[chip_index];

	/* Setup the GPIO chips */
	gpio_chip = &(chip->chip_out);
	gpio_chip->label           = "htcpld-out";
	gpio_chip->dev             = dev;
	gpio_chip->owner           = THIS_MODULE;
	gpio_chip->get             = htcpld_chip_get;
	gpio_chip->set             = htcpld_chip_set;
	gpio_chip->direction_input = NULL;
	gpio_chip->direction_output = htcpld_direction_output;
	gpio_chip->base            = plat_chip_data->gpio_out_base;
	gpio_chip->ngpio           = plat_chip_data->num_gpios;

	gpio_chip = &(chip->chip_in);
	gpio_chip->label           = "htcpld-in";
	gpio_chip->dev             = dev;
	gpio_chip->owner           = THIS_MODULE;
	gpio_chip->get             = htcpld_chip_get;
	gpio_chip->set             = NULL;
	gpio_chip->direction_input = htcpld_direction_input;
	gpio_chip->direction_output = NULL;
	gpio_chip->to_irq          = htcpld_chip_to_irq;
	gpio_chip->base            = plat_chip_data->gpio_in_base;
	gpio_chip->ngpio           = plat_chip_data->num_gpios;

	/* Add the GPIO chips */
	ret = gpiochip_add(&(chip->chip_out));
	if (ret) {
		dev_warn(dev, "Unable to register output GPIOs for 0x%x: %d\n",
			 plat_chip_data->addr, ret);
		return ret;
	}

	ret = gpiochip_add(&(chip->chip_in));
	if (ret) {
		dev_warn(dev, "Unable to register input GPIOs for 0x%x: %d\n",
			 plat_chip_data->addr, ret);
		gpiochip_remove(&(chip->chip_out));
		return ret;
	}

	return 0;
}

static int htcpld_setup_chips(struct platform_device *pdev)
{
	struct htcpld_data *htcpld;
	struct device *dev = &pdev->dev;
	struct htcpld_core_platform_data *pdata;
	int i;

	/* Get the platform and driver data */
	pdata = dev_get_platdata(dev);
	htcpld = platform_get_drvdata(pdev);

	/* Setup each chip's output GPIOs */
	htcpld->nchips = pdata->num_chip;
	htcpld->chip = devm_kzalloc(dev, sizeof(struct htcpld_chip) * htcpld->nchips,
				    GFP_KERNEL);
	if (!htcpld->chip) {
		dev_warn(dev, "Unable to allocate memory for chips\n");
		return -ENOMEM;
	}

	/* Add the chips as best we can */
	for (i = 0; i < htcpld->nchips; i++) {
		int ret;

		/* Setup the HTCPLD chips */
		htcpld->chip[i].reset = pdata->chip[i].reset;
		htcpld->chip[i].cache_out = pdata->chip[i].reset;
		htcpld->chip[i].cache_in = 0;
		htcpld->chip[i].dev = dev;
		htcpld->chip[i].irq_start = pdata->chip[i].irq_base;
		htcpld->chip[i].nirqs = pdata->chip[i].num_irqs;

		INIT_WORK(&(htcpld->chip[i].set_val_work), &htcpld_chip_set_ni);
		spin_lock_init(&(htcpld->chip[i].lock));

		/* Setup the interrupts for the chip */
		if (htcpld->chained_irq) {
			ret = htcpld_setup_chip_irq(pdev, i);
			if (ret)
				continue;
		}

		/* Register the chip with I2C */
		ret = htcpld_register_chip_i2c(pdev, i);
		if (ret)
			continue;


		/* Register the chips with the GPIO subsystem */
		ret = htcpld_register_chip_gpio(pdev, i);
		if (ret) {
			/* Unregister the chip from i2c and continue */
			htcpld_unregister_chip_i2c(pdev, i);
			continue;
		}

		dev_info(dev, "Registered chip at 0x%x\n", pdata->chip[i].addr);
	}

	return 0;
}

static int htcpld_core_probe(struct platform_device *pdev)
{
	struct htcpld_data *htcpld;
	struct device *dev = &pdev->dev;
	struct htcpld_core_platform_data *pdata;
	struct resource *res;
	int ret = 0;

	if (!dev)
		return -ENODEV;

	pdata = dev_get_platdata(dev);
	if (!pdata) {
		dev_warn(dev, "Platform data not found for htcpld core!\n");
		return -ENXIO;
	}

	htcpld = devm_kzalloc(dev, sizeof(struct htcpld_data), GFP_KERNEL);
	if (!htcpld)
		return -ENOMEM;

	/* Find chained irq */
	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
	if (res) {
		int flags;
		htcpld->chained_irq = res->start;

		/* Setup the chained interrupt handler */
		flags = IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING;
		ret = request_threaded_irq(htcpld->chained_irq,
					   NULL, htcpld_handler,
					   flags, pdev->name, htcpld);
		if (ret) {
			dev_warn(dev, "Unable to setup chained irq handler: %d\n", ret);
			return ret;
		} else
			device_init_wakeup(dev, 0);
	}

	/* Set the driver data */
	platform_set_drvdata(pdev, htcpld);

	/* Setup the htcpld chips */
	ret = htcpld_setup_chips(pdev);
	if (ret)
		return ret;

	/* Request the GPIO(s) for the int reset and set them up */
	if (pdata->int_reset_gpio_hi) {
		ret = gpio_request(pdata->int_reset_gpio_hi, "htcpld-core");
		if (ret) {
			/*
			 * If it failed, that sucks, but we can probably
			 * continue on without it.
			 */
			dev_warn(dev, "Unable to request int_reset_gpio_hi -- interrupts may not work\n");
			htcpld->int_reset_gpio_hi = 0;
		} else {
			htcpld->int_reset_gpio_hi = pdata->int_reset_gpio_hi;
			gpio_set_value(htcpld->int_reset_gpio_hi, 1);
		}
	}

	if (pdata->int_reset_gpio_lo) {
		ret = gpio_request(pdata->int_reset_gpio_lo, "htcpld-core");
		if (ret) {
			/*
			 * If it failed, that sucks, but we can probably
			 * continue on without it.
			 */
			dev_warn(dev, "Unable to request int_reset_gpio_lo -- interrupts may not work\n");
			htcpld->int_reset_gpio_lo = 0;
		} else {
			htcpld->int_reset_gpio_lo = pdata->int_reset_gpio_lo;
			gpio_set_value(htcpld->int_reset_gpio_lo, 0);
		}
	}

	dev_info(dev, "Initialized successfully\n");
	return 0;
}

/* The I2C Driver -- used internally */
static const struct i2c_device_id htcpld_chip_id[] = {
	{ "htcpld-chip", 0 },
	{ }
};
MODULE_DEVICE_TABLE(i2c, htcpld_chip_id);


static struct i2c_driver htcpld_chip_driver = {
	.driver = {
		.name	= "htcpld-chip",
	},
	.id_table = htcpld_chip_id,
};

/* The Core Driver */
static struct platform_driver htcpld_core_driver = {
	.driver = {
		.name = "i2c-htcpld",
	},
};

static int __init htcpld_core_init(void)
{
	int ret;

	/* Register the I2C Chip driver */
	ret = i2c_add_driver(&htcpld_chip_driver);
	if (ret)
		return ret;

	/* Probe for our chips */
	return platform_driver_probe(&htcpld_core_driver, htcpld_core_probe);
}

static void __exit htcpld_core_exit(void)
{
	i2c_del_driver(&htcpld_chip_driver);
	platform_driver_unregister(&htcpld_core_driver);
}

module_init(htcpld_core_init);
module_exit(htcpld_core_exit);

MODULE_AUTHOR("Cory Maccarrone <darkstar6262@gmail.com>");
MODULE_DESCRIPTION("I2C HTC PLD Driver");
MODULE_LICENSE("GPL");

qspinlock, hybrid pvqspinlock and unfair lock with various number of vCPUs are as follows: vCPUs pvqlock hybrid pvqlock unfair lock ----- ------- -------------- ----------- 30 342.1s 329.1s 329.1s 36 314.1s 305.3s 307.3s 45 345.0s 302.1s 306.6s 54 365.4s 308.6s 307.8s 72 358.9s 293.6s 303.9s 108 343.0s 285.9s 304.2s The hybrid pvqspinlock performs better or comparable to the unfair lock. By turning on QUEUED_LOCK_STAT, the table below showed the number of lock acquisitions in unfair mode and queue mode after a kernel build with various number of vCPUs. vCPUs queued mode unfair mode ----- ----------- ----------- 30 9,130,518 294,954 36 10,856,614 386,809 45 8,467,264 11,475,373 54 6,409,987 19,670,855 72 4,782,063 25,712,180 It can be seen that as the VM became more and more over-committed, the ratio of locks acquired in unfair mode increases. This is all done automatically to get the best overall performance as possible. Using a kernel locking microbenchmark with number of locking threads equals to the number of vCPUs available on the same machine, the minimum, average and maximum (min/avg/max) numbers of locking operations done per thread in a 5-second testing interval are shown below: vCPUs hybrid pvqlock unfair lock ----- -------------- ----------- 36 822,135/881,063/950,363 75,570/313,496/ 690,465 54 542,435/581,664/625,937 35,460/204,280/ 457,172 72 397,500/428,177/499,299 17,933/150,679/ 708,001 108 257,898/288,150/340,871 3,085/181,176/1,257,109 It can be seen that the hybrid pvqspinlocks are more fair and performant than the unfair locks in this test. The table below shows the kernel build times on a smaller 2-socket 16-core 32-thread E5-2620 v4 system. vCPUs pvqlock hybrid pvqlock unfair lock ----- ------- -------------- ----------- 16 436.8s 433.4s 435.6s 36 366.2s 364.8s 364.5s 48 423.6s 376.3s 370.2s 64 433.1s 376.6s 376.8s Again, the performance of the hybrid pvqspinlock was comparable to that of the unfair lock. Signed-off-by: Waiman Long <longman@redhat.com> Reviewed-by: Juergen Gross <jgross@suse.com> Reviewed-by: Eduardo Valentin <eduval@amazon.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1510089486-3466-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-11-07locking/rwlocks: Fix commentsCheng Jian - fix the list of locking API headers in kernel/locking/spinlock.c - fix an #endif comment Signed-off-by: Cheng Jian <cj.chengjian@huawei.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1509706788-152547-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-11-07Merge branch 'linus' into locking/core, to resolve conflictsIngo Molnar Conflicts: include/linux/compiler-clang.h include/linux/compiler-gcc.h include/linux/compiler-intel.h include/uapi/linux/stddef.h Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-11-02License cleanup: add SPDX GPL-2.0 license identifier to files with no licenseGreg Kroah-Hartman Many source files in the tree are missing licensing information, which makes it harder for compliance tools to determine the correct license. By default all files without license information are under the default license of the kernel, which is GPL version 2. Update the files which contain no license information with the 'GPL-2.0' SPDX license identifier. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. How this work was done: Patches were generated and checked against linux-4.14-rc6 for a subset of the use cases: - file had no licensing information it it. - file was a */uapi/* one with no licensing information in it, - file was a */uapi/* one with existing licensing information, Further patches will be generated in subsequent months to fix up cases where non-standard license headers were used, and references to license had to be inferred by heuristics based on keywords. The analysis to determine which SPDX License Identifier to be applied to a file was done in a spreadsheet of side by side results from of the output of two independent scanners (ScanCode & Windriver) producing SPDX tag:value files created by Philippe Ombredanne. Philippe prepared the base worksheet, and did an initial spot review of a few 1000 files. The 4.13 kernel was the starting point of the analysis with 60,537 files assessed. Kate Stewart did a file by file comparison of the scanner results in the spreadsheet to determine which SPDX license identifier(s) to be applied to the file. She confirmed any determination that was not immediately clear with lawyers working with the Linux Foundation. Criteria used to select files for SPDX license identifier tagging was: - Files considered eligible had to be source code files. - Make and config files were included as candidates if they contained >5 lines of source - File already had some variant of a license header in it (even if <5 lines). All documentation files were explicitly excluded. The following heuristics were used to determine which SPDX license identifiers to apply. - when both scanners couldn't find any license traces, file was considered to have no license information in it, and the top level COPYING file license applied. For non */uapi/* files that summary was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 11139 and resulted in the first patch in this series. If that file was a */uapi/* path one, it was "GPL-2.0 WITH Linux-syscall-note" otherwise it was "GPL-2.0". Results of that was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 WITH Linux-syscall-note 930 and resulted in the second patch in this series. - if a file had some form of licensing information in it, and was one of the */uapi/* ones, it was denoted with the Linux-syscall-note if any GPL family license was found in the file or had no licensing in it (per prior point). Results summary: SPDX license identifier # files ---------------------------------------------------|------ GPL-2.0 WITH Linux-syscall-note 270 GPL-2.0+ WITH Linux-syscall-note 169 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) 21 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) 17 LGPL-2.1+ WITH Linux-syscall-note 15 GPL-1.0+ WITH Linux-syscall-note 14 ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) 5 LGPL-2.0+ WITH Linux-syscall-note 4 LGPL-2.1 WITH Linux-syscall-note 3 ((GPL-2.0 WITH Linux-syscall-note) OR MIT) 3 ((GPL-2.0 WITH Linux-syscall-note) AND MIT) 1 and that resulted in the third patch in this series. - when the two scanners agreed on the detected license(s), that became the concluded license(s). - when there was disagreement between the two scanners (one detected a license but the other didn't, or they both detected different licenses) a manual inspection of the file occurred. - In most cases a manual inspection of the information in the file resulted in a clear resolution of the license that should apply (and which scanner probably needed to revisit its heuristics). - When it was not immediately clear, the license identifier was confirmed with lawyers working with the Linux Foundation. - If there was any question as to the appropriate license identifier, the file was flagged for further research and to be revisited later in time. In total, over 70 hours of logged manual review was done on the spreadsheet to determine the SPDX license identifiers to apply to the source files by Kate, Philippe, Thomas and, in some cases, confirmation by lawyers working with the Linux Foundation. Kate also obtained a third independent scan of the 4.13 code base from FOSSology, and compared selected files where the other two scanners disagreed against that SPDX file, to see if there was new insights. The Windriver scanner is based on an older version of FOSSology in part, so they are related. Thomas did random spot checks in about 500 files from the spreadsheets for the uapi headers and agreed with SPDX license identifier in the files he inspected. For the non-uapi files Thomas did random spot checks in about 15000 files. In initial set of patches against 4.14-rc6, 3 files were found to have copy/paste license identifier errors, and have been fixed to reflect the correct identifier. Additionally Philippe spent 10 hours this week doing a detailed manual inspection and review of the 12,461 patched files from the initial patch version early this week with: - a full scancode scan run, collecting the matched texts, detected license ids and scores - reviewing anything where there was a license detected (about 500+ files) to ensure that the applied SPDX license was correct - reviewing anything where there was no detection but the patch license was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied SPDX license was correct This produced a worksheet with 20 files needing minor correction. This worksheet was then exported into 3 different .csv files for the different types of files to be modified. These .csv files were then reviewed by Greg. Thomas wrote a script to parse the csv files and add the proper SPDX tag to the file, in the format that the file expected. This script was further refined by Greg based on the output to detect more types of files automatically and to distinguish between header and source .c files (which need different comment types.) Finally Greg ran the script using the .csv files to generate the patches. Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org> Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2017-10-25locking/lockdep: Introduce CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK=yByungchul Park Add a Kconfig knob that enables the lockdep "crossrelease_fullstack" boot parameter. Suggested-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Byungchul Park <byungchul.park@lge.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-7-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-25locking/lockdep: Add a boot parameter allowing unwind in cross-release and ↵Byungchul Park disable it by default Johan Hovold reported a heavy performance regression caused by lockdep cross-release: > Boot time (from "Linux version" to login prompt) had in fact doubled > since 4.13 where it took 17 seconds (with my current config) compared to > the 35 seconds I now see with 4.14-rc4. > > I quick bisect pointed to lockdep and specifically the following commit: > > 28a903f63ec0 ("locking/lockdep: Handle non(or multi)-acquisition > of a crosslock") > > which I've verified is the commit which doubled the boot time (compared > to 28a903f63ec0^) (added by lockdep crossrelease series [1]). Currently cross-release performs unwind on every acquisition, but that is very expensive. This patch makes unwind optional and disables it by default and only records acquire_ip. Full stack traces are sometimes required for full analysis, in which case a boot paramter, crossrelease_fullstack, can be specified. On my qemu Ubuntu machine (x86_64, 4 cores, 512M), the regression was fixed. We measure boot times with 'perf stat --null --repeat 10 $QEMU', where $QEMU launches a kernel with init=/bin/true: 1. No lockdep enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.756558155 seconds time elapsed ( +- 0.09% ) 2. Lockdep enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.968710420 seconds time elapsed ( +- 0.12% ) 3. Lockdep enabled + cross-release enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 3.153839636 seconds time elapsed ( +- 0.31% ) 4. Lockdep enabled + cross-release enabled + this patch applied: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.963669551 seconds time elapsed ( +- 0.11% ) I.e. lockdep cross-release performance is now indistinguishable from vanilla lockdep. Bisected-by: Johan Hovold <johan@kernel.org> Analyzed-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Reported-by: Johan Hovold <johan@kernel.org> Signed-off-by: Byungchul Park <byungchul.park@lge.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-25locking/qrwlock: Prevent slowpath writers getting held up by fastpathWill Deacon When a prospective writer takes the qrwlock locking slowpath due to the lock being held, it attempts to cmpxchg the wmode field from 0 to _QW_WAITING so that concurrent lockers also take the slowpath and queue on the spinlock accordingly, allowing the lockers to drain. Unfortunately, this isn't fair, because a fastpath writer that comes in after the lock is made available but before the _QW_WAITING flag is set can effectively jump the queue. If there is a steady stream of prospective writers, then the waiter will be held off indefinitely. This patch restores fairness by separating _QW_WAITING and _QW_LOCKED into two distinct fields: _QW_LOCKED continues to occupy the bottom byte of the lockword so that it can be cleared unconditionally when unlocking, but _QW_WAITING now occupies what used to be the bottom bit of the reader count. This then forces the slow-path for concurrent lockers. Tested-by: Waiman Long <longman@redhat.com> Tested-by: Jeremy Linton <jeremy.linton@arm.com> Tested-by: Adam Wallis <awallis@codeaurora.org> Tested-by: Jan Glauber <jglauber@cavium.com> Signed-off-by: Will Deacon <will.deacon@arm.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-6-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-25locking/qrwlock: Use atomic_cond_read_acquire() when spinning in qrwlockWill Deacon The qrwlock slowpaths involve spinning when either a prospective reader is waiting for a concurrent writer to drain, or a prospective writer is waiting for concurrent readers to drain. In both of these situations, atomic_cond_read_acquire() can be used to avoid busy-waiting and make use of any backoff functionality provided by the architecture. This patch replaces the open-code loops and rspin_until_writer_unlock() implementation with atomic_cond_read_acquire(). The write mode transition zero to _QW_WAITING is left alone, since (a) this doesn't need acquire semantics and (b) should be fast. Tested-by: Waiman Long <longman@redhat.com> Tested-by: Jeremy Linton <jeremy.linton@arm.com> Tested-by: Adam Wallis <awallis@codeaurora.org> Tested-by: Jan Glauber <jglauber@cavium.com> Signed-off-by: Will Deacon <will.deacon@arm.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-25locking/qrwlock: Use 'struct qrwlock' instead of 'struct __qrwlock'Will Deacon There's no good reason to keep the internal structure of struct qrwlock hidden from qrwlock.h, particularly as it's actually needed for unlock and ends up being abstracted independently behind the __qrwlock_write_byte() function. Stop pretending we can hide this stuff, and move the __qrwlock definition into qrwlock, removing the __qrwlock_write_byte() nastiness and using the same struct definition everywhere instead. Signed-off-by: Will Deacon <will.deacon@arm.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Waiman Long <longman@redhat.com> Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-10locking/core: Remove {read,spin,write}_can_lock()Will Deacon Outside of the locking code itself, {read,spin,write}_can_lock() have no users in tree. Apparmor (the last remaining user of write_can_lock()) got moved over to lockdep by the previous patch. This patch removes the use of {read,spin,write}_can_lock() from the BUILD_LOCK_OPS macro, deferring to the trylock operation for testing the lock status, and subsequently removes the unused macros altogether. They aren't guaranteed to work in a concurrent environment and can give incorrect results in the case of qrwlock. Signed-off-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-10locking/rwsem: Add down_read_killable()Kirill Tkhai Similar to down_read() and down_write_killable(), add killable version of down_read(), based on __down_read_killable() function, added in previous patches. Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670119884.23930.2585570605960763239.stgit@localhost.localdomain Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-10-10locking/lockdep: Fix stacktrace messPeter Zijlstra There is some complication between check_prevs_add() and check_prev_add() wrt. saving stack traces. The problem is that we want to be frugal with saving stack traces, since it consumes static resources. We'll only know in check_prev_add() if we need the trace, but we can call into it multiple times. So we want to do on-demand and re-use. A further complication is that check_prev_add() can drop graph_lock and mess with our static resources. In any case, the current state; after commit: ce07a9415f26 ("locking/lockdep: Make check_prev_add() able to handle external stack_trace") is that we'll assume the trace contains valid data once check_prev_add() returns '2'. However, as noted by Josh, this is false, check_prev_add() can return '2' before having saved a trace, this then result in the possibility of using uninitialized data. Testing, as reported by Wu, shows a NULL deref. So simplify. Since the graph_lock() thing is a debug path that hasn't really been used in a long while, take it out back and avoid the head-ache. Further initialize the stack_trace to a known 'empty' state; as long as nr_entries == 0, nothing should deref entries. We can then use the 'entries == NULL' test for a valid trace / on-demand saving. Analyzed-by: Josh Poimboeuf <jpoimboe@redhat.com> Reported-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Byungchul Park <byungchul.park@lge.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Fixes: ce07a9415f26 ("locking/lockdep: Make check_prev_add() able to handle external stack_trace") Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-09-29locking/rwsem-xadd: Fix missed wakeup due to reordering of loadPrateek Sood If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. Signed-off-by: Prateek Sood <prsood@codeaurora.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-09-13mm: treewide: remove GFP_TEMPORARY allocation flagMichal Hocko GFP_TEMPORARY was introduced by commit e12ba74d8ff3 ("Group short-lived and reclaimable kernel allocations") along with __GFP_RECLAIMABLE. It's primary motivation was to allow users to tell that an allocation is short lived and so the allocator can try to place such allocations close together and prevent long term fragmentation. As much as this sounds like a reasonable semantic it becomes much less clear when to use the highlevel GFP_TEMPORARY allocation flag. How long is temporary? Can the context holding that memory sleep? Can it take locks? It seems there is no good answer for those questions. The current implementation of GFP_TEMPORARY is basically GFP_KERNEL | __GFP_RECLAIMABLE which in itself is tricky because basically none of the existing caller provide a way to reclaim the allocated memory. So this is rather misleading and hard to evaluate for any benefits. I have checked some random users and none of them has added the flag with a specific justification. I suspect most of them just copied from other existing users and others just thought it might be a good idea to use without any measuring. This suggests that GFP_TEMPORARY just motivates for cargo cult usage without any reasoning. I believe that our gfp flags are quite complex already and especially those with highlevel semantic should be clearly defined to prevent from confusion and abuse. Therefore I propose dropping GFP_TEMPORARY and replace all existing users to simply use GFP_KERNEL. Please note that SLAB users with shrinkers will still get __GFP_RECLAIMABLE heuristic and so they will be placed properly for memory fragmentation prevention. I can see reasons we might want some gfp flag to reflect shorterm allocations but I propose starting from a clear semantic definition and only then add users with proper justification. This was been brought up before LSF this year by Matthew [1] and it turned out that GFP_TEMPORARY really doesn't have a clear semantic. It seems to be a heuristic without any measured advantage for most (if not all) its current users. The follow up discussion has revealed that opinions on what might be temporary allocation differ a lot between developers. So rather than trying to tweak existing users into a semantic which they haven't expected I propose to simply remove the flag and start from scratch if we really need a semantic for short term allocations. [1] http://lkml.kernel.org/r/20170118054945.GD18349@bombadil.infradead.org [akpm@linux-foundation.org: fix typo] [akpm@linux-foundation.org: coding-style fixes] [sfr@canb.auug.org.au: drm/i915: fix up] Link: http://lkml.kernel.org/r/20170816144703.378d4f4d@canb.auug.org.au Link: http://lkml.kernel.org/r/20170728091904.14627-1-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Matthew Wilcox <willy@infradead.org> Cc: Neil Brown <neilb@suse.de> Cc: "Theodore Ts'o" <tytso@mit.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-09-08locking/rtmutex: replace top-waiter and pi_waiters leftmost cachingDavidlohr Bueso ... with the generic rbtree flavor instead. No changes in semantics whatsoever. Link: http://lkml.kernel.org/r/20170719014603.19029-10-dave@stgolabs.net Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-09-04Merge branch 'locking-core-for-linus' of ↵Linus Torvalds git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull locking updates from Ingo Molnar: - Add 'cross-release' support to lockdep, which allows APIs like completions, where it's not the 'owner' who releases the lock, to be tracked. It's all activated automatically under CONFIG_PROVE_LOCKING=y. - Clean up (restructure) the x86 atomics op implementation to be more readable, in preparation of KASAN annotations. (Dmitry Vyukov) - Fix static keys (Paolo Bonzini) - Add killable versions of down_read() et al (Kirill Tkhai) - Rework and fix jump_label locking (Marc Zyngier, Paolo Bonzini) - Rework (and fix) tlb_flush_pending() barriers (Peter Zijlstra) - Remove smp_mb__before_spinlock() and convert its usages, introduce smp_mb__after_spinlock() (Peter Zijlstra) * 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (56 commits) locking/lockdep/selftests: Fix mixed read-write ABBA tests sched/completion: Avoid unnecessary stack allocation for COMPLETION_INITIALIZER_ONSTACK() acpi/nfit: Fix COMPLETION_INITIALIZER_ONSTACK() abuse locking/pvqspinlock: Relax cmpxchg's to improve performance on some architectures smp: Avoid using two cache lines for struct call_single_data locking/lockdep: Untangle xhlock history save/restore from task independence locking/refcounts, x86/asm: Disable CONFIG_ARCH_HAS_REFCOUNT for the time being futex: Remove duplicated code and fix undefined behaviour Documentation/locking/atomic: Finish the document... locking/lockdep: Fix workqueue crossrelease annotation workqueue/lockdep: 'Fix' flush_work() annotation locking/lockdep/selftests: Add mixed read-write ABBA tests mm, locking/barriers: Clarify tlb_flush_pending() barriers locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE and CONFIG_LOCKDEP_COMPLETIONS truly non-interactive locking/lockdep: Explicitly initialize wq_barrier::done::map locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS locking/lockdep: Reword title of LOCKDEP_CROSSRELEASE config locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING locking/refcounts, x86/asm: Implement fast refcount overflow protection locking/lockdep: Fix the rollback and overwrite detection logic in crossrelease ... 2017-08-29locking/pvqspinlock: Relax cmpxchg's to improve performance on some ↵Waiman Long architectures All the locking related cmpxchg's in the following functions are replaced with the _acquire variants: - pv_queued_spin_steal_lock() - trylock_clear_pending() This change should help performance on architectures that use LL/SC. The cmpxchg in pv_kick_node() is replaced with a relaxed version with explicit memory barrier to make sure that it is fully ordered in the writing of next->lock and the reading of pn->state whether the cmpxchg is a success or failure without affecting performance in non-LL/SC architectures. On a 2-socket 12-core 96-thread Power8 system with pvqspinlock explicitly enabled, the performance of a locking microbenchmark with and without this patch on a 4.13-rc4 kernel with Xinhui's PPC qspinlock patch were as follows: # of thread w/o patch with patch % Change ----------- --------- ---------- -------- 8 5054.8 Mop/s 5209.4 Mop/s +3.1% 16 3985.0 Mop/s 4015.0 Mop/s +0.8% 32 2378.2 Mop/s 2396.0 Mop/s +0.7% Suggested-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Waiman Long <longman@redhat.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrea Parri <parri.andrea@gmail.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Pan Xinhui <xinhui@linux.vnet.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will.deacon@arm.com> Link: http://lkml.kernel.org/r/1502741222-24360-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-08-29locking/lockdep: Untangle xhlock history save/restore from task independencePeter Zijlstra Where XHLOCK_{SOFT,HARD} are save/restore points in the xhlocks[] to ensure the temporal IRQ events don't interact with task state, the XHLOCK_PROC is a fundament different beast that just happens to share the interface. The purpose of XHLOCK_PROC is to annotate independent execution inside one task. For example workqueues, each work should appear to run in its own 'pristine' 'task'. Remove XHLOCK_PROC in favour of its own interface to avoid confusion. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Byungchul Park <byungchul.park@lge.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: boqun.feng@gmail.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: kernel-team@lge.com Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20170829085939.ggmb6xiohw67micb@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-08-25locking/lockdep: Fix workqueue crossrelease annotationPeter Zijlstra The new completion/crossrelease annotations interact unfavourable with the extant flush_work()/flush_workqueue() annotations. The problem is that when a single work class does: wait_for_completion(&C) and complete(&C) in different executions, we'll build dependencies like: lock_map_acquire(W) complete_acquire(C) and lock_map_acquire(W) complete_release(C) which results in the dependency chain: W->C->W, which lockdep thinks spells deadlock, even though there is no deadlock potential since works are ran concurrently. One possibility would be to change the work 'lock' to recursive-read, but that would mean hitting a lockdep limitation on recursive locks. Also, unconditinoally switching to recursive-read here would fail to detect the actual deadlock on single-threaded workqueues, which do have a problem with this. For now, forcefully disregard these locks for crossrelease. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Tejun Heo <tj@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: boqun.feng@gmail.com Cc: byungchul.park@lge.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: oleg@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org> 2017-08-17locking: Remove spin_unlock_wait() generic definitionsPaul E. McKenney There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore removes spin_unlock_wait() and related definitions from core code. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Andrea Parri <parri.andrea@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org>