xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision a2d9c45398ebcab924c89b2d4961bbebc2aeb3d6)
1 /*
2  * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/list.h>
35 
36 #include "EndpointManager.h"
37 
38 
39 // References:
40 //  - RFC 793 - Transmission Control Protocol
41 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
42 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
43 //
44 // Things this implementation currently doesn't implement:
45 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
46 //	  RFC 2001, RFC 2581, RFC 3042
47 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
48 //	- Explicit Congestion Notification (ECN), RFC 3168
49 //	- SYN-Cache
50 //	- TCP Extensions for High Performance, RFC 1323
51 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
52 //	- Forward RTO-Recovery, RFC 4138
53 //	- Time-Wait hash instead of keeping sockets alive
54 
// Yields a printable C string for \a address, formatted for the domain of
// the endpoint the macro is used in (it relies on Domain() being in scope).
#define PrintAddress(address) \
	AddressString(Domain(), (address), true).Data()
57 
58 //#define TRACE_TCP
59 //#define PROBE_TCP
60 
#ifdef TRACE_TCP
// Debug output helper. Every message is prefixed with the calling thread,
// the current system time, the endpoint pointer and the name of its state.
// The thread ID is a 32 bit and the system time a 64 bit signed value, so
// the B_PRI* macros are used to get a matching conversion on every
// architecture (plain "%ld"/"%llu" only fit some of them).
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%" B_PRId32 ": TCP [%" B_PRId64 \
		"] %p (%12s) " format "\n", find_thread(NULL), system_time(), this, \
		name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
69 
#ifdef PROBE_TCP
// Prints one line describing the send side of the endpoint (sequence
// numbers, windows, queue usage, retransmit timeout) along with the
// addresses and the size of \a buffer.
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu " \
		"win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), \
		PrintAddress(buffer->source), PrintAddress(buffer->destination), \
		buffer->size, \
		fSendNext.Number(), fSendUnacknowledged.Number(), \
		fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, \
		(fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), \
		fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
81 
#if TCP_TRACING
// Kernel tracing entries for the TCP endpoint. Each class captures the
// values of interest at construction time and prints them in AddDump().
// All 32 bit members are printed via the B_PRI*32 macros, so that the
// format strings are correct independent of how (u)int32 is defined.
namespace TCPTracing {

/*!	Records a segment that has been received by an endpoint. */
class Receive : public AbstractTraceEntry {
public:
	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
			net_buffer* buffer)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fWindow(window),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
			"flags %x, seq %" B_PRIu32 ", ack %" B_PRIu32 ", wnd %" B_PRIu32,
			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
			fSequence, fAcknowledge, fWindow);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fWindow;
	tcp_state		fState;
	uint8			fFlags;
};

/*!	Records a segment that is about to be sent, including the sequence
	range passed in by the caller.
*/
class Send : public AbstractTraceEntry {
public:
	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
			tcp_sequence firstSequence, tcp_sequence lastSequence)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fFirstSequence(firstSequence.Number()),
		fLastSequence(lastSequence.Number()),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
			"flags %x, seq %" B_PRIu32 ", ack %" B_PRIu32 ", first %" B_PRIu32
			", last %" B_PRIu32,
			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
			fSequence, fAcknowledge, fFirstSequence, fLastSequence);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fFirstSequence;
	uint32			fLastSequence;
	tcp_state		fState;
	uint8			fFlags;
};

/*!	Records the state an endpoint is in at the time the entry is created. */
class State : public AbstractTraceEntry {
public:
	State(TCPEndpoint* endpoint)
		:
		fEndpoint(endpoint),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) state change", fEndpoint,
			name_for_state(fState));
	}

protected:
	TCPEndpoint*	fEndpoint;
	tcp_state		fState;
};

/*!	Records that a listening endpoint has spawned a new endpoint. */
class Spawn : public AbstractTraceEntry {
public:
	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
		:
		fListeningEndpoint(listeningEndpoint),
		fSpawnedEndpoint(spawnedEndpoint)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
	}

protected:
	TCPEndpoint*	fListeningEndpoint;
	TCPEndpoint*	fSpawnedEndpoint;
};

/*!	Records an error message together with the source line it was raised
	at. Only the pointer to \a error is stored, not a copy of the string.
*/
class Error : public AbstractTraceEntry {
public:
	Error(TCPEndpoint* endpoint, const char* error, int32 line)
		:
		fEndpoint(endpoint),
		fLine(line),
		fError(error),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
			name_for_state(fState), fLine, fError);
	}

protected:
	TCPEndpoint*	fEndpoint;
	int32			fLine;
	const char*		fError;
	tcp_state		fState;
};

/*!	Records a timer event; \a which names the timer. Only the pointer to
	the name is stored, not a copy of the string.
*/
class Timer : public AbstractTraceEntry {
public:
	Timer(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};

}	// namespace TCPTracing

// T(Entry(args)) creates the given trace entry; it expands to nothing when
// TCP tracing is disabled.
#	define T(x)	new(std::nothrow) TCPTracing::x
#else
#	define T(x)
#endif	// TCP_TRACING
253 
// Initial estimate for the packet round trip time (RTT). It seeds the
// retransmit timeout as is, and the RTT estimators after being divided by
// kTimestampFactor. Presumably in microseconds - TODO confirm.
#define TCP_INITIAL_RTT		2000000

// Bits stored in TCPEndpoint::fFlags
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
	FLAG_CLOSED					= 1 << 3,
	FLAG_DELETE_ON_CLOSE		= 1 << 4,
	FLAG_LOCAL					= 1 << 5
};


// Divisor applied to system_time() to obtain the TCP timestamp clock
static const int kTimestampFactor = 1024;
272 
273 
274 static inline bigtime_t
275 absolute_timeout(bigtime_t timeout)
276 {
277 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
278 		return timeout;
279 
280 	return timeout + system_time();
281 }
282 
283 
284 static inline status_t
285 posix_error(status_t error)
286 {
287 	if (error == B_TIMED_OUT)
288 		return B_WOULD_BLOCK;
289 
290 	return error;
291 }
292 
293 
294 static inline bool
295 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
296 	uint32 receiveWindow)
297 {
298 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
299 }
300 
301 
302 static inline bool
303 segment_in_sequence(const tcp_segment_header& segment, int size,
304 	const tcp_sequence& receiveNext, uint32 receiveWindow)
305 {
306 	tcp_sequence sequence(segment.sequence);
307 	if (size == 0) {
308 		if (receiveWindow == 0)
309 			return sequence == receiveNext;
310 		return in_window(sequence, receiveNext, receiveWindow);
311 	} else {
312 		if (receiveWindow == 0)
313 			return false;
314 		return in_window(sequence, receiveNext, receiveWindow)
315 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
316 	}
317 }
318 
319 
320 static inline bool
321 is_writable(tcp_state state)
322 {
323 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
324 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
325 }
326 
327 
328 static inline uint32 tcp_now()
329 {
330 	return system_time() / kTimestampFactor;
331 }
332 
333 
334 static inline uint32 tcp_diff_timestamp(uint32 base)
335 {
336 	uint32 now = tcp_now();
337 
338 	if (now > base)
339 		return now - base;
340 
341 	return now + UINT_MAX - base;
342 }
343 
344 
345 static inline bool
346 state_needs_finish(int32 state)
347 {
348 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
349 		|| state == FINISH_SENT || state == CLOSING;
350 }
351 
352 
353 //	#pragma mark -
354 
355 
356 WaitList::WaitList(const char* name)
357 {
358 	fCondition = 0;
359 	fSem = create_sem(0, name);
360 }
361 
362 
363 WaitList::~WaitList()
364 {
365 	delete_sem(fSem);
366 }
367 
368 
369 status_t
370 WaitList::InitCheck() const
371 {
372 	return fSem;
373 }
374 
375 
376 status_t
377 WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
378 {
379 	locker.Unlock();
380 
381 	status_t status = B_OK;
382 
383 	while (!atomic_test_and_set(&fCondition, 0, 1)) {
384 		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
385 			timeout);
386 		if (status != B_OK)
387 			break;
388 	}
389 
390 	locker.Lock();
391 	return status;
392 }
393 
394 
395 void
396 WaitList::Signal()
397 {
398 	atomic_or(&fCondition, 1);
399 #ifdef __HAIKU__
400 	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
401 #else
402 	int32 count;
403 	if (get_sem_count(fSem, &count) == B_OK && count < 0)
404 		release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
405 #endif
406 }
407 
408 
409 //	#pragma mark -
410 
411 
412 TCPEndpoint::TCPEndpoint(net_socket* socket)
413 	:
414 	ProtocolSocket(socket),
415 	fManager(NULL),
416 	fReceiveList("tcp receive"),
417 	fSendList("tcp send"),
418 	fOptions(0),
419 	fSendWindowShift(0),
420 	fReceiveWindowShift(0),
421 	fSendUnacknowledged(0),
422 	fSendNext(0),
423 	fSendMax(0),
424 	fSendUrgentOffset(0),
425 	fSendWindow(0),
426 	fSendMaxWindow(0),
427 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
428 	fSendQueue(socket->send.buffer_size),
429 	fInitialSendSequence(0),
430 	fDuplicateAcknowledgeCount(0),
431 	fRoute(NULL),
432 	fReceiveNext(0),
433 	fReceiveMaxAdvertised(0),
434 	fReceiveWindow(socket->receive.buffer_size),
435 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
436 	fReceiveQueue(socket->receive.buffer_size),
437 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
438 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
439 	fRetransmitTimeout(TCP_INITIAL_RTT),
440 	fReceivedTimestamp(0),
441 	fCongestionWindow(0),
442 	fSlowStartThreshold(0),
443 	fState(CLOSED),
444 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
445 {
446 	// TODO: to be replaced with a real read/write locking strategy!
447 	mutex_init(&fLock, "tcp lock");
448 
449 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
450 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
451 		this);
452 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
453 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
454 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
455 		this);
456 }
457 
458 
459 TCPEndpoint::~TCPEndpoint()
460 {
461 	mutex_lock(&fLock);
462 
463 	_CancelConnectionTimers();
464 	gStackModule->cancel_timer(&fTimeWaitTimer);
465 
466 	if (fManager != NULL) {
467 		fManager->Unbind(this);
468 		put_endpoint_manager(fManager);
469 	}
470 
471 	mutex_destroy(&fLock);
472 
473 	// we need to wait for all timers to return
474 	gStackModule->wait_for_timer(&fRetransmitTimer);
475 	gStackModule->wait_for_timer(&fPersistTimer);
476 	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
477 	gStackModule->wait_for_timer(&fTimeWaitTimer);
478 
479 	gDatalinkModule->put_route(Domain(), fRoute);
480 }
481 
482 
483 status_t
484 TCPEndpoint::InitCheck() const
485 {
486 	if (fReceiveList.InitCheck() < B_OK)
487 		return fReceiveList.InitCheck();
488 
489 	if (fSendList.InitCheck() < B_OK)
490 		return fSendList.InitCheck();
491 
492 	return B_OK;
493 }
494 
495 
496 //	#pragma mark - protocol API
497 
498 
499 status_t
500 TCPEndpoint::Open()
501 {
502 	TRACE("Open()");
503 
504 	status_t status = ProtocolSocket::Open();
505 	if (status < B_OK)
506 		return status;
507 
508 	fManager = get_endpoint_manager(Domain());
509 	if (fManager == NULL)
510 		return EAFNOSUPPORT;
511 
512 	return B_OK;
513 }
514 
515 
516 status_t
517 TCPEndpoint::Close()
518 {
519 	TRACE("Close()");
520 
521 	MutexLocker locker(fLock);
522 
523 	if (fState == LISTEN)
524 		delete_sem(fAcceptSemaphore);
525 
526 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
527 		// TODO: what about linger in case of SYNCHRONIZE_SENT?
528 		fState = CLOSED;
529 		T(State(this));
530 		return B_OK;
531 	}
532 
533 	status_t status = _Disconnect(true);
534 	if (status != B_OK)
535 		return status;
536 
537 	if (socket->options & SO_LINGER) {
538 		TRACE("Close(): Lingering for %i secs", socket->linger);
539 
540 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
541 
542 		while (fSendQueue.Used() > 0) {
543 			status = fSendList.Wait(locker, maximum);
544 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
545 				break;
546 			else if (status < B_OK)
547 				return status;
548 		}
549 
550 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
551 			fSendQueue.Used());
552 	}
553 	return B_OK;
554 }
555 
556 
557 void
558 TCPEndpoint::Free()
559 {
560 	TRACE("Free()");
561 
562 	MutexLocker _(fLock);
563 
564 	if (fState <= SYNCHRONIZE_SENT)
565 		return;
566 
567 	// we are only interested in the timer, not in changing state
568 	_EnterTimeWait();
569 
570 	fFlags |= FLAG_CLOSED;
571 	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
572 		// we'll be freed later when the 2MSL timer expires
573 		gSocketModule->acquire_socket(socket);
574 	}
575 }
576 
577 
578 /*!	Creates and sends a synchronize packet to /a address, and then waits
579 	until the connection has been established or refused.
580 */
581 status_t
582 TCPEndpoint::Connect(const sockaddr* address)
583 {
584 	TRACE("Connect() on address %s", PrintAddress(address));
585 
586 	if (!AddressModule()->is_same_family(address))
587 		return EAFNOSUPPORT;
588 
589 	MutexLocker locker(fLock);
590 
591 	if (gStackModule->is_restarted_syscall()) {
592 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
593 		status_t status = _WaitForEstablished(locker, timeout);
594 		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
595 			strerror(status), timeout);
596 		return posix_error(status);
597 	}
598 
599 	// Can only call connect() from CLOSED or LISTEN states
600 	// otherwise endpoint is considered already connected
601 	if (fState == LISTEN) {
602 		// this socket is about to connect; remove pending connections in the backlog
603 		gSocketModule->set_max_backlog(socket, 0);
604 	} else if (fState == ESTABLISHED) {
605 		return EISCONN;
606 	} else if (fState != CLOSED)
607 		return EINPROGRESS;
608 
609 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
610 	sockaddr_storage _address;
611 	if (AddressModule()->is_empty_address(address, false)) {
612 		AddressModule()->get_loopback_address((sockaddr *)&_address);
613 		// for IPv4 and IPv6 the port is at the same offset
614 		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
615 		address = (sockaddr *)&_address;
616 	}
617 
618 	status_t status = _PrepareSendPath(address);
619 	if (status < B_OK)
620 		return status;
621 
622 	TRACE("  Connect(): starting 3-way handshake...");
623 
624 	fState = SYNCHRONIZE_SENT;
625 	T(State(this));
626 
627 	// send SYN
628 	status = _SendQueued();
629 	if (status != B_OK) {
630 		_Close();
631 		return status;
632 	}
633 
634 	// If we are running over Loopback, after _SendQueued() returns we
635 	// may be in ESTABLISHED already.
636 	if (fState == ESTABLISHED) {
637 		TRACE("  Connect() completed after _SendQueued()");
638 		return B_OK;
639 	}
640 
641 	// wait until 3-way handshake is complete (if needed)
642 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
643 	if (timeout == 0) {
644 		// we're a non-blocking socket
645 		TRACE("  Connect() delayed, return EINPROGRESS");
646 		return EINPROGRESS;
647 	}
648 
649 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
650 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
651 
652 	status = _WaitForEstablished(locker, absoluteTimeout);
653 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
654 		strerror(status), timeout);
655 	return posix_error(status);
656 }
657 
658 
659 status_t
660 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
661 {
662 	TRACE("Accept()");
663 
664 	MutexLocker locker(fLock);
665 
666 	status_t status;
667 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
668 	if (gStackModule->is_restarted_syscall())
669 		timeout = gStackModule->restore_syscall_restart_timeout();
670 	else
671 		gStackModule->store_syscall_restart_timeout(timeout);
672 
673 	do {
674 		locker.Unlock();
675 
676 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
677 			| B_CAN_INTERRUPT, timeout);
678 		if (status != B_OK) {
679 			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
680 				return B_WOULD_BLOCK;
681 
682 			return status;
683 		}
684 
685 		locker.Lock();
686 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
687 #ifdef TRACE_TCP
688 		if (status == B_OK)
689 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
690 #endif
691 	} while (status != B_OK);
692 
693 	return status;
694 }
695 
696 
697 status_t
698 TCPEndpoint::Bind(const sockaddr *address)
699 {
700 	if (address == NULL)
701 		return B_BAD_VALUE;
702 
703 	MutexLocker lock(fLock);
704 
705 	TRACE("Bind() on address %s", PrintAddress(address));
706 
707 	if (fState != CLOSED)
708 		return EISCONN;
709 
710 	return fManager->Bind(this, address);
711 }
712 
713 
714 status_t
715 TCPEndpoint::Unbind(struct sockaddr *address)
716 {
717 	TRACE("Unbind()");
718 
719 	MutexLocker _(fLock);
720 	return fManager->Unbind(this);
721 }
722 
723 
724 status_t
725 TCPEndpoint::Listen(int count)
726 {
727 	TRACE("Listen()");
728 
729 	MutexLocker _(fLock);
730 
731 	if (fState != CLOSED && fState != LISTEN)
732 		return B_BAD_VALUE;
733 
734 	if (fState == CLOSED) {
735 		fAcceptSemaphore = create_sem(0, "tcp accept");
736 		if (fAcceptSemaphore < B_OK)
737 			return ENOBUFS;
738 
739 		status_t status = fManager->SetPassive(this);
740 		if (status != B_OK) {
741 			delete_sem(fAcceptSemaphore);
742 			fAcceptSemaphore = -1;
743 			return status;
744 		}
745 	}
746 
747 	gSocketModule->set_max_backlog(socket, count);
748 
749 	fState = LISTEN;
750 	T(State(this));
751 	return B_OK;
752 }
753 
754 
755 status_t
756 TCPEndpoint::Shutdown(int direction)
757 {
758 	TRACE("Shutdown(%i)", direction);
759 
760 	MutexLocker lock(fLock);
761 
762 	if (direction == SHUT_RD || direction == SHUT_RDWR)
763 		fFlags |= FLAG_NO_RECEIVE;
764 
765 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
766 		// TODO: That's not correct. After read/write shutting down the socket
767 		// one should still be able to read previously arrived data.
768 		_Disconnect(false);
769 	}
770 
771 	return B_OK;
772 }
773 
774 
775 /*!	Puts data contained in \a buffer into send buffer */
776 status_t
777 TCPEndpoint::SendData(net_buffer *buffer)
778 {
779 	MutexLocker lock(fLock);
780 
781 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
782 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
783 		  fSendQueue.Free());
784 
785 	if (fState == CLOSED)
786 		return ENOTCONN;
787 	if (fState == LISTEN)
788 		return EDESTADDRREQ;
789 	if (!is_writable(fState)) {
790 		// we only send signals when called from userland
791 		if (gStackModule->is_syscall())
792 			send_signal(find_thread(NULL), SIGPIPE);
793 		return EPIPE;
794 	}
795 
796 	uint32 flags = buffer->flags;
797 	size_t left = buffer->size;
798 
799 	bigtime_t timeout = absolute_timeout(socket->send.timeout);
800 	if (gStackModule->is_restarted_syscall())
801 		timeout = gStackModule->restore_syscall_restart_timeout();
802 	else
803 		gStackModule->store_syscall_restart_timeout(timeout);
804 
805 	while (left > 0) {
806 		while (fSendQueue.Free() < socket->send.low_water_mark) {
807 			// wait until enough space is available
808 			status_t status = fSendList.Wait(lock, timeout);
809 			if (status < B_OK) {
810 				TRACE("  SendData() returning %s (%d)",
811 					strerror(posix_error(status)), (int)posix_error(status));
812 				return posix_error(status);
813 			}
814 
815 			if (!is_writable(fState)) {
816 				// we only send signals when called from userland
817 				if (gStackModule->is_syscall())
818 					send_signal(find_thread(NULL), SIGPIPE);
819 				return EPIPE;
820 			}
821 		}
822 
823 		size_t size = fSendQueue.Free();
824 		if (size < left) {
825 			// we need to split the original buffer
826 			net_buffer* clone = gBufferModule->clone(buffer, false);
827 				// TODO: add offset/size parameter to net_buffer::clone() or
828 				// even a move_data() function, as this is a bit inefficient
829 			if (clone == NULL)
830 				return ENOBUFS;
831 
832 			status_t status = gBufferModule->trim(clone, size);
833 			if (status != B_OK) {
834 				gBufferModule->free(clone);
835 				return status;
836 			}
837 
838 			gBufferModule->remove_header(buffer, size);
839 			left -= size;
840 			fSendQueue.Add(clone);
841 		} else {
842 			left -= buffer->size;
843 			fSendQueue.Add(buffer);
844 		}
845 	}
846 
847 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
848 
849 	bool force = false;
850 	if ((flags & MSG_OOB) != 0) {
851 		fSendUrgentOffset = fSendQueue.LastSequence();
852 			// RFC 961 specifies that the urgent offset points to the last
853 			// byte of urgent data. However, this is commonly implemented as
854 			// here, ie. it points to the first byte after the urgent data.
855 		force = true;
856 	}
857 	if ((flags & MSG_EOF) != 0)
858 		_Disconnect(false);
859 
860 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
861 		_SendQueued(force);
862 
863 	return B_OK;
864 }
865 
866 
867 ssize_t
868 TCPEndpoint::SendAvailable()
869 {
870 	MutexLocker locker(fLock);
871 
872 	ssize_t available;
873 
874 	if (is_writable(fState))
875 		available = fSendQueue.Free();
876 	else
877 		available = EPIPE;
878 
879 	TRACE("SendAvailable(): %li", available);
880 	return available;
881 }
882 
883 
884 status_t
885 TCPEndpoint::FillStat(net_stat *stat)
886 {
887 	MutexLocker _(fLock);
888 
889 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
890 	stat->receive_queue_size = fReceiveQueue.Available();
891 	stat->send_queue_size = fSendQueue.Used();
892 
893 	return B_OK;
894 }
895 
896 
897 status_t
898 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
899 {
900 	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
901 
902 	MutexLocker locker(fLock);
903 
904 	*_buffer = NULL;
905 
906 	if (fState == CLOSED)
907 		return ENOTCONN;
908 
909 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
910 	if (gStackModule->is_restarted_syscall())
911 		timeout = gStackModule->restore_syscall_restart_timeout();
912 	else
913 		gStackModule->store_syscall_restart_timeout(timeout);
914 
915 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
916 		if (flags & MSG_DONTWAIT)
917 			return B_WOULD_BLOCK;
918 
919 		status_t status = _WaitForEstablished(locker, timeout);
920 		if (status < B_OK)
921 			return posix_error(status);
922 	}
923 
924 	size_t dataNeeded = socket->receive.low_water_mark;
925 
926 	// When MSG_WAITALL is set then the function should block
927 	// until the full amount of data can be returned.
928 	if (flags & MSG_WAITALL)
929 		dataNeeded = numBytes;
930 
931 	// TODO: add support for urgent data (MSG_OOB)
932 
933 	while (true) {
934 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
935 			|| fState == TIME_WAIT) {
936 			// ``Connection closing''.
937 			return B_OK;
938 		}
939 
940 		if (fReceiveQueue.Available() > 0) {
941 			if (fReceiveQueue.Available() >= dataNeeded
942 				|| (fReceiveQueue.PushedData() > 0
943 					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
944 				break;
945 		} else if (fState == FINISH_RECEIVED) {
946 			// ``If no text is awaiting delivery, the RECEIVE will
947 			//   get a Connection closing''.
948 			return B_OK;
949 		}
950 
951 		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
952 			return B_WOULD_BLOCK;
953 
954 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
955 			return B_OK;
956 
957 		status_t status = fReceiveList.Wait(locker, timeout);
958 		if (status < B_OK) {
959 			// The Open Group base specification mentions that EINTR should be
960 			// returned if the recv() is interrupted before _any data_ is
961 			// available. So we actually check if there is data, and if so,
962 			// push it to the user.
963 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
964 				&& fReceiveQueue.Available() > 0)
965 				break;
966 
967 			return posix_error(status);
968 		}
969 	}
970 
971 	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());
972 
973 	if (numBytes < fReceiveQueue.Available())
974 		fReceiveList.Signal();
975 
976 	bool clone = (flags & MSG_PEEK) != 0;
977 
978 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
979 
980 	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());
981 
982 	// if we are opening the window, check if we should send an ACK
983 	if (!clone)
984 		SendAcknowledge(false);
985 
986 	return receivedBytes;
987 }
988 
989 
990 ssize_t
991 TCPEndpoint::ReadAvailable()
992 {
993 	MutexLocker locker(fLock);
994 
995 	TRACE("ReadAvailable(): %li", _AvailableData());
996 
997 	return _AvailableData();
998 }
999 
1000 
1001 status_t
1002 TCPEndpoint::SetSendBufferSize(size_t length)
1003 {
1004 	MutexLocker _(fLock);
1005 	fSendQueue.SetMaxBytes(length);
1006 	return B_OK;
1007 }
1008 
1009 
1010 status_t
1011 TCPEndpoint::SetReceiveBufferSize(size_t length)
1012 {
1013 	MutexLocker _(fLock);
1014 	fReceiveQueue.SetMaxBytes(length);
1015 	return B_OK;
1016 }
1017 
1018 
1019 status_t
1020 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1021 {
1022 	if (*_length != sizeof(int))
1023 		return B_BAD_VALUE;
1024 
1025 	int* value = (int*)_value;
1026 
1027 	switch (option) {
1028 		case TCP_NODELAY:
1029 			if ((fOptions & TCP_NODELAY) != 0)
1030 				*value = 1;
1031 			else
1032 				*value = 0;
1033 			return B_OK;
1034 
1035 		case TCP_MAXSEG:
1036 			*value = fReceiveMaxSegmentSize;
1037 			return B_OK;
1038 
1039 		default:
1040 			return B_BAD_VALUE;
1041 	}
1042 }
1043 
1044 
1045 status_t
1046 TCPEndpoint::SetOption(int option, const void* _value, int length)
1047 {
1048 	if (option != TCP_NODELAY)
1049 		return B_BAD_VALUE;
1050 
1051 	if (length != sizeof(int))
1052 		return B_BAD_VALUE;
1053 
1054 	const int* value = (const int*)_value;
1055 
1056 	MutexLocker _(fLock);
1057 	if (*value)
1058 		fOptions |= TCP_NODELAY;
1059 	else
1060 		fOptions &= ~TCP_NODELAY;
1061 
1062 	return B_OK;
1063 }
1064 
1065 
1066 //	#pragma mark - misc
1067 
1068 
1069 bool
1070 TCPEndpoint::IsBound() const
1071 {
1072 	return !LocalAddress().IsEmpty(true);
1073 }
1074 
1075 
1076 bool
1077 TCPEndpoint::IsLocal() const
1078 {
1079 	return (fFlags & FLAG_LOCAL) != 0;
1080 }
1081 
1082 
1083 status_t
1084 TCPEndpoint::DelayedAcknowledge()
1085 {
1086 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1087 		// timer was active, send an ACK now (with the exception above,
1088 		// we send every other ACK)
1089 		return SendAcknowledge(true);
1090 	}
1091 
1092 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1093 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1094 	return B_OK;
1095 }
1096 
1097 
1098 status_t
1099 TCPEndpoint::SendAcknowledge(bool force)
1100 {
1101 	return _SendQueued(force, 0);
1102 }
1103 
1104 
1105 void
1106 TCPEndpoint::_StartPersistTimer()
1107 {
1108 	gStackModule->set_timer(&fPersistTimer, 1000000LL);
1109 }
1110 
1111 
1112 void
1113 TCPEndpoint::_EnterTimeWait()
1114 {
1115 	TRACE("_EnterTimeWait()\n");
1116 
1117 	_CancelConnectionTimers();
1118 
1119 	if (fState == TIME_WAIT && IsLocal()) {
1120 		// we do not use TIME_WAIT state for local connections
1121 		fFlags |= FLAG_DELETE_ON_CLOSE;
1122 		return;
1123 	}
1124 
1125 	_UpdateTimeWait();
1126 }
1127 
1128 
1129 void
1130 TCPEndpoint::_UpdateTimeWait()
1131 {
1132 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1133 }
1134 
1135 
1136 void
1137 TCPEndpoint::_CancelConnectionTimers()
1138 {
1139 	gStackModule->cancel_timer(&fRetransmitTimer);
1140 	gStackModule->cancel_timer(&fPersistTimer);
1141 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1142 }
1143 
1144 
1145 /*!	Sends the FIN flag to the peer when the connection is still open.
1146 	Moves the endpoint to the next state depending on where it was.
1147 */
1148 status_t
1149 TCPEndpoint::_Disconnect(bool closing)
1150 {
1151 	tcp_state previousState = fState;
1152 
1153 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1154 		fState = FINISH_SENT;
1155 	else if (fState == FINISH_RECEIVED)
1156 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1157 	else
1158 		return B_OK;
1159 
1160 	T(State(this));
1161 
1162 	status_t status = _SendQueued();
1163 	if (status != B_OK) {
1164 		fState = previousState;
1165 		T(State(this));
1166 		return status;
1167 	}
1168 
1169 	return B_OK;
1170 }
1171 
1172 
1173 void
1174 TCPEndpoint::_MarkEstablished()
1175 {
1176 	fState = ESTABLISHED;
1177 	T(State(this));
1178 
1179 	if (gSocketModule->has_parent(socket)) {
1180 		gSocketModule->set_connected(socket);
1181 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1182 	}
1183 
1184 	fSendList.Signal();
1185 }
1186 
1187 
1188 status_t
1189 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1190 {
1191 	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1192 	// When investigating this, also have a look at _Close() and _HandleReset().
1193 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1194 		if (socket->error != B_OK)
1195 			return socket->error;
1196 
1197 		status_t status = fSendList.Wait(locker, timeout);
1198 		if (status < B_OK)
1199 			return status;
1200 	}
1201 
1202 	return B_OK;
1203 }
1204 
1205 
1206 //	#pragma mark - receive
1207 
1208 
1209 void
1210 TCPEndpoint::_Close()
1211 {
1212 	_CancelConnectionTimers();
1213 	fState = CLOSED;
1214 	T(State(this));
1215 
1216 	fFlags |= FLAG_DELETE_ON_CLOSE;
1217 
1218 	fSendList.Signal();
1219 	_NotifyReader();
1220 
1221 	if (gSocketModule->has_parent(socket)) {
1222 		// We still have a parent - obviously, we haven't been accepted yet,
1223 		// so no one could ever close us.
1224 		_CancelConnectionTimers();
1225 		gSocketModule->set_aborted(socket);
1226 	}
1227 }
1228 
1229 
1230 void
1231 TCPEndpoint::_HandleReset(status_t error)
1232 {
1233 	socket->error = error;
1234 	_Close();
1235 
1236 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1237 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1238 }
1239 
1240 
1241 void
1242 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1243 {
1244 	if (++fDuplicateAcknowledgeCount < 3)
1245 		return;
1246 
1247 	if (fDuplicateAcknowledgeCount == 3) {
1248 		_ResetSlowStart();
1249 		fCongestionWindow = fSlowStartThreshold + 3
1250 			* fSendMaxSegmentSize;
1251 		fSendNext = segment.acknowledge;
1252 	} else if (fDuplicateAcknowledgeCount > 3)
1253 		fCongestionWindow += fSendMaxSegmentSize;
1254 
1255 	_SendQueued();
1256 }
1257 
1258 
1259 void
1260 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1261 	size_t segmentLength)
1262 {
1263 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1264 		tcp_sequence sequence(segment.sequence);
1265 
1266 		if (fLastAcknowledgeSent >= sequence
1267 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1268 			fReceivedTimestamp = segment.timestamp_value;
1269 	}
1270 }
1271 
1272 
1273 ssize_t
1274 TCPEndpoint::_AvailableData() const
1275 {
1276 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1277 	//       the application of FLAG_NO_RECEIVE in listen()ing
1278 	//       sockets.
1279 	if (fState == LISTEN)
1280 		return gSocketModule->count_connected(socket);
1281 	if (fState == SYNCHRONIZE_SENT)
1282 		return 0;
1283 
1284 	ssize_t availableData = fReceiveQueue.Available();
1285 
1286 	if (availableData == 0 && !_ShouldReceive())
1287 		return ENOTCONN;
1288 
1289 	return availableData;
1290 }
1291 
1292 
1293 void
1294 TCPEndpoint::_NotifyReader()
1295 {
1296 	fReceiveList.Signal();
1297 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1298 }
1299 
1300 
1301 bool
1302 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1303 {
1304 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1305 		// Remember the position of the finish received flag
1306 		fFinishReceived = true;
1307 		fFinishReceivedAt = segment.sequence + buffer->size;
1308 	}
1309 
1310 	fReceiveQueue.Add(buffer, segment.sequence);
1311 	fReceiveNext = fReceiveQueue.NextSequence();
1312 
1313 	if (fFinishReceived) {
1314 		// Set or reset the finish flag on the current segment
1315 		if (fReceiveNext < fFinishReceivedAt)
1316 			segment.flags &= ~TCP_FLAG_FINISH;
1317 		else
1318 			segment.flags |= TCP_FLAG_FINISH;
1319 	}
1320 
1321 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1322 		fReceiveNext.Number(), fReceiveQueue.Available());
1323 
1324 	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1325 		fReceiveQueue.SetPushPointer();
1326 
1327 	return fReceiveQueue.Available() > 0;
1328 }
1329 
1330 
1331 void
1332 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1333 {
1334 	fInitialReceiveSequence = segment.sequence;
1335 	fFinishReceived = false;
1336 
1337 	// count the received SYN
1338 	segment.sequence++;
1339 
1340 	fReceiveNext = segment.sequence;
1341 	fReceiveQueue.SetInitialSequence(segment.sequence);
1342 
1343 	if ((fOptions & TCP_NOOPT) == 0) {
1344 		if (segment.max_segment_size > 0)
1345 			fSendMaxSegmentSize = segment.max_segment_size;
1346 
1347 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1348 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1349 			fSendWindowShift = segment.window_shift;
1350 		} else {
1351 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1352 			fReceiveWindowShift = 0;
1353 		}
1354 
1355 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1356 			fFlags |= FLAG_OPTION_TIMESTAMP;
1357 			fReceivedTimestamp = segment.timestamp_value;
1358 		} else
1359 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1360 	}
1361 
1362 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1363 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1364 }
1365 
1366 
1367 bool
1368 TCPEndpoint::_ShouldReceive() const
1369 {
1370 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1371 		return false;
1372 
1373 	return fState == ESTABLISHED || fState == FINISH_SENT
1374 		|| fState == FINISH_ACKNOWLEDGED;
1375 }
1376 
1377 
1378 int32
1379 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1380 	net_buffer* buffer)
1381 {
1382 	MutexLocker _(fLock);
1383 
1384 	// TODO error checking
1385 	ProtocolSocket::Open();
1386 
1387 	fState = SYNCHRONIZE_RECEIVED;
1388 	T(Spawn(parent, this));
1389 
1390 	fManager = parent->fManager;
1391 
1392 	LocalAddress().SetTo(buffer->destination);
1393 	PeerAddress().SetTo(buffer->source);
1394 
1395 	TRACE("Spawn()");
1396 
1397 	// TODO: proper error handling!
1398 	if (fManager->BindChild(this) != B_OK) {
1399 		T(Error(this, "binding failed", __LINE__));
1400 		return DROP;
1401 	}
1402 	if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1403 		T(Error(this, "prepare send faild", __LINE__));
1404 		return DROP;
1405 	}
1406 
1407 	fOptions = parent->fOptions;
1408 	fAcceptSemaphore = parent->fAcceptSemaphore;
1409 
1410 	_PrepareReceivePath(segment);
1411 
1412 	// send SYN+ACK
1413 	if (_SendQueued() != B_OK) {
1414 		T(Error(this, "sending failed", __LINE__));
1415 		return DROP;
1416 	}
1417 
1418 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1419 		// we handled this flag now, it must not be set for further processing
1420 
1421 	return _Receive(segment, buffer);
1422 }
1423 
1424 
1425 int32
1426 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1427 {
1428 	TRACE("ListenReceive()");
1429 
1430 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1431 	// but the error behaviour differs
1432 	if (segment.flags & TCP_FLAG_RESET)
1433 		return DROP;
1434 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1435 		return DROP | RESET;
1436 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1437 		return DROP;
1438 
1439 	// TODO: drop broadcast/multicast
1440 
1441 	// spawn new endpoint for accept()
1442 	net_socket* newSocket;
1443 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1444 		T(Error(this, "spawning failed", __LINE__));
1445 		return DROP;
1446 	}
1447 
1448 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1449 		segment, buffer);
1450 }
1451 
1452 
1453 int32
1454 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1455 	net_buffer *buffer)
1456 {
1457 	TRACE("_SynchronizeSentReceive()");
1458 
1459 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1460 		&& (fInitialSendSequence >= segment.acknowledge
1461 			|| fSendMax < segment.acknowledge))
1462 		return DROP | RESET;
1463 
1464 	if (segment.flags & TCP_FLAG_RESET) {
1465 		_HandleReset(ECONNREFUSED);
1466 		return DROP;
1467 	}
1468 
1469 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1470 		return DROP;
1471 
1472 	fSendUnacknowledged = segment.acknowledge;
1473 	_PrepareReceivePath(segment);
1474 
1475 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1476 		_MarkEstablished();
1477 	} else {
1478 		// simultaneous open
1479 		fState = SYNCHRONIZE_RECEIVED;
1480 		T(State(this));
1481 	}
1482 
1483 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1484 		// we handled this flag now, it must not be set for further processing
1485 
1486 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1487 }
1488 
1489 
/*!	Standard processing of an incoming segment; must not be called in the
	LISTEN or SYNCHRONIZE_SENT states (asserted below).

	A header prediction fast path handles pure acknowledgements and in-order
	data in the ESTABLISHED state. Everything else goes through the regular
	steps: sequence/window validation, RST and SYN handling, trimming the
	payload to the receive window, processing the acknowledgement, queueing
	the data, and finally handling a FIN including the state transition.

	\param segment The parsed TCP header; its \c flags and \c sequence may be
		modified while the segment is trimmed.
	\param buffer The segment's payload; it may be trimmed at both ends.
	\return A combination of the action flags KEEP/DROP, ACKNOWLEDGE,
		IMMEDIATE_ACKNOWLEDGE, and RESET.
*/
1490 int32
1491 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1492 {
1493 	uint32 advertisedWindow = (uint32)segment.advertised_window
1494 		<< fSendWindowShift;
1495 	size_t segmentLength = buffer->size;
1496 
1497 	// First, handle the most common case for uni-directional data transfer
1498 	// (known as header prediction - the segment must not change the window,
1499 	// and must be the expected sequence, and contain no control flags)
1500 
1501 	if (fState == ESTABLISHED
1502 		&& segment.AcknowledgeOnly()
1503 		&& fReceiveNext == segment.sequence
1504 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1505 		&& fSendNext == fSendMax) {
1506 		_UpdateTimestamps(segment, segmentLength);
1507 
1508 		if (segmentLength == 0) {
1509 			// this is a pure acknowledge segment - we're on the sending end
1510 			if (fSendUnacknowledged < segment.acknowledge
1511 				&& fSendMax >= segment.acknowledge) {
1512 				_Acknowledged(segment);
1513 				return DROP;
1514 			}
			// otherwise fall through to the standard processing below
1515 		} else if (segment.acknowledge == fSendUnacknowledged
1516 			&& fReceiveQueue.IsContiguous()
1517 			&& fReceiveQueue.Free() >= segmentLength
1518 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
			// in-order data that acknowledges nothing new - we're on the
			// receiving end
1519 			if (_AddData(segment, buffer))
1520 				_NotifyReader();
1521 
1522 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1523 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1524 		}
1525 	}
1526 
1527 	// The fast path was not applicable, so we continue with the standard
1528 	// processing of the incoming segment
1529 
1530 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1531 
1532 	if (fState != CLOSED && fState != TIME_WAIT) {
1533 		// Check sequence number
1534 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1535 				fReceiveWindow)) {
1536 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1537 				fReceiveNext.Number(), fReceiveWindow);
1538 			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1539 				// TODO: this doesn't look right - review!
1540 				return DROP;
1541 			}
1542 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1543 		}
1544 	}
1545 
1546 	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1547 		// Is this a valid reset?
1548 		// We generally ignore resets in time wait state (see RFC 1337)
1549 		if (fLastAcknowledgeSent <= segment.sequence
1550 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1551 				+ fReceiveWindow)
1552 			&& fState != TIME_WAIT) {
			// the error reported to the socket depends on how far the
			// connection had come
1553 			status_t error;
1554 			if (fState == SYNCHRONIZE_RECEIVED)
1555 				error = ECONNREFUSED;
1556 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1557 				error = ENOTCONN;
1558 			else
1559 				error = ECONNRESET;
1560 
1561 			_HandleReset(error);
1562 		}
1563 
1564 		return DROP;
1565 	}
1566 
1567 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1568 		|| (fState == SYNCHRONIZE_RECEIVED
1569 			&& (fInitialReceiveSequence > segment.sequence
1570 				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1571 					&& (fSendUnacknowledged > segment.acknowledge
1572 						|| fSendMax < segment.acknowledge))))) {
1573 		// reset the connection - either the initial SYN was faulty, or we
1574 		// received a SYN within the data stream
1575 		return DROP | RESET;
1576 	}
1577 
1578 	// TODO: Check this! Why do we advertise a window outside of what we should
1579 	// buffer?
1580 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1581 		// the window must not shrink
1582 
1583 	// trim buffer to be within the receive window
1584 	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1585 	if (drop > 0) {
1586 		if ((uint32)drop > buffer->size
1587 			|| ((uint32)drop == buffer->size
1588 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1589 			// don't accidentally remove a FIN we shouldn't remove
1590 			segment.flags &= ~TCP_FLAG_FINISH;
1591 			drop = buffer->size;
1592 		}
1593 
1594 		// remove duplicate data at the start
1595 		TRACE("* remove %ld bytes from the start", drop);
1596 		gBufferModule->remove_header(buffer, drop);
1597 		segment.sequence += drop;
1598 	}
1599 
1600 	int32 action = KEEP;
1601 
1602 	drop = (int32)(segment.sequence + buffer->size
1603 		- (fReceiveNext + fReceiveWindow)).Number();
1604 	if (drop > 0) {
1605 		// remove data exceeding our window
1606 		if ((uint32)drop >= buffer->size) {
1607 			// if we can accept data, or the segment is not what we'd expect,
1608 			// drop the segment (an immediate acknowledge is always triggered)
1609 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1610 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1611 
1612 			action |= IMMEDIATE_ACKNOWLEDGE;
1613 		}
1614 
1615 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1616 			// we need to remove the finish, too, as part of the data
1617 			drop--;
1618 		}
1619 
1620 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1621 		TRACE("* remove %ld bytes from the end", drop);
1622 		gBufferModule->remove_trailer(buffer, drop);
1623 	}
1624 
1625 #ifdef TRACE_TCP
1626 	if (advertisedWindow > fSendWindow) {
1627 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1628 			advertisedWindow);
1629 	}
1630 #endif
1631 
	// take over the window advertised by the peer, and track its maximum
1632 	fSendWindow = advertisedWindow;
1633 	if (advertisedWindow > fSendMaxWindow)
1634 		fSendMaxWindow = advertisedWindow;
1635 
1636 	// Then look at the acknowledgement for any updates
1637 
1638 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1639 		// process acknowledged data
1640 		if (fState == SYNCHRONIZE_RECEIVED)
1641 			_MarkEstablished();
1642 
		// the peer acknowledges something we have not sent yet
1643 		if (fSendMax < segment.acknowledge)
1644 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1645 
		// NOTE(review): only acknowledgements strictly older than
		// fSendUnacknowledged get here; one that equals it takes the else
		// branch below - confirm that this is intended.
1646 		if (segment.acknowledge < fSendUnacknowledged) {
			// NOTE(review): fSendWindow has just been set to
			// advertisedWindow above, so the window comparison in this
			// condition is always true at this point.
1647 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1648 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1649 				TRACE("Receive(): duplicate ack!");
1650 
1651 				_DuplicateAcknowledge(segment);
1652 			}
1653 
1654 			return DROP;
1655 		} else {
1656 			// this segment acknowledges in flight data
1657 
1658 			if (fDuplicateAcknowledgeCount >= 3) {
1659 				// deflate the window.
1660 				fCongestionWindow = fSlowStartThreshold;
1661 			}
1662 
1663 			fDuplicateAcknowledgeCount = 0;
1664 
1665 			if (fSendMax == segment.acknowledge)
1666 				TRACE("Receive(): all inflight data ack'd!");
1667 
1668 			if (segment.acknowledge > fSendQueue.LastSequence()
1669 					&& fState > ESTABLISHED) {
1670 				TRACE("Receive(): FIN has been acknowledged!");
1671 
1672 				switch (fState) {
1673 					case FINISH_SENT:
1674 						fState = FINISH_ACKNOWLEDGED;
1675 						T(State(this));
1676 						break;
1677 					case CLOSING:
1678 						fState = TIME_WAIT;
1679 						T(State(this));
1680 						_EnterTimeWait();
1681 						return DROP;
1682 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1683 						_Close();
1684 						break;
1685 
1686 					default:
1687 						break;
1688 				}
1689 			}
1690 
			// _Close() above may have moved us to CLOSED
1691 			if (fState != CLOSED)
1692 				_Acknowledged(segment);
1693 		}
1694 	}
1695 
1696 	if (segment.flags & TCP_FLAG_URGENT) {
1697 		if (fState == ESTABLISHED || fState == FINISH_SENT
1698 			|| fState == FINISH_ACKNOWLEDGED) {
1699 			// TODO: Handle urgent data:
1700 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1701 			//  - signal the user that urgent data is available (SIGURG)
1702 		}
1703 	}
1704 
1705 	bool notify = false;
1706 
1707 	// The buffer may be freed if its data is added to the queue, so cache
1708 	// the size as we still need it later.
1709 	uint32 bufferSize = buffer->size;
1710 
1711 	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1712 		&& _ShouldReceive())
1713 		notify = _AddData(segment, buffer);
1714 	else {
		// With FLAG_NO_RECEIVE set the data is not queued, but
		// fReceiveNext is still advanced past it.
1715 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1716 			fReceiveNext += buffer->size;
1717 
1718 		action = (action & ~KEEP) | DROP;
1719 	}
1720 
1721 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
		// count the FIN for the timestamp update at the end
1722 		segmentLength++;
1723 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1724 			TRACE("Receive(): peer is finishing connection!");
1725 			fReceiveNext++;
1726 			notify = true;
1727 
1728 			// FIN implies PUSH
1729 			fReceiveQueue.SetPushPointer();
1730 
1731 			// we'll reply immediately to the FIN if we are not
1732 			// transitioning to TIME WAIT so we immediately ACK it.
1733 			action |= IMMEDIATE_ACKNOWLEDGE;
1734 
1735 			// other side is closing connection; change states
1736 			switch (fState) {
1737 				case ESTABLISHED:
1738 				case SYNCHRONIZE_RECEIVED:
1739 					fState = FINISH_RECEIVED;
1740 					T(State(this));
1741 					break;
1742 				case FINISH_SENT:
1743 					// simultaneous close
1744 					fState = CLOSING;
1745 					T(State(this));
1746 					break;
1747 				case FINISH_ACKNOWLEDGED:
1748 					fState = TIME_WAIT;
1749 					T(State(this));
1750 					_EnterTimeWait();
1751 					break;
1752 				case TIME_WAIT:
1753 					_UpdateTimeWait();
1754 					break;
1755 
1756 				default:
1757 					break;
1758 			}
1759 		}
1760 	}
1761 
1762 	if (notify)
1763 		_NotifyReader();
1764 
	// NOTE(review): segments with the SYN flag set have already been
	// answered with DROP | RESET above, so only the size check appears to
	// matter here.
1765 	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1766 		action |= ACKNOWLEDGE;
1767 
1768 	_UpdateTimestamps(segment, segmentLength);
1769 
1770 	TRACE("Receive() Action %ld", action);
1771 
1772 	return action;
1773 }
1774 
1775 
1776 int32
1777 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1778 {
1779 	MutexLocker locker(fLock);
1780 
1781 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
1782 		"\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
1783 		buffer, buffer->size, PrintAddress(buffer->source),
1784 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1785 		segment.acknowledge,
1786 		(uint32)segment.advertised_window << fSendWindowShift);
1787 	T(Receive(this, segment,
1788 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1789 	int32 segmentAction = DROP;
1790 
1791 	switch (fState) {
1792 		case LISTEN:
1793 			segmentAction = _ListenReceive(segment, buffer);
1794 			break;
1795 
1796 		case SYNCHRONIZE_SENT:
1797 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1798 			break;
1799 
1800 		case SYNCHRONIZE_RECEIVED:
1801 		case ESTABLISHED:
1802 		case FINISH_RECEIVED:
1803 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1804 		case FINISH_SENT:
1805 		case FINISH_ACKNOWLEDGED:
1806 		case CLOSING:
1807 		case TIME_WAIT:
1808 		case CLOSED:
1809 			segmentAction = _Receive(segment, buffer);
1810 			break;
1811 	}
1812 
1813 	// process acknowledge action as asked for by the *Receive() method
1814 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1815 		SendAcknowledge(true);
1816 	else if (segmentAction & ACKNOWLEDGE)
1817 		DelayedAcknowledge();
1818 
1819 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1820 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1821 		locker.Unlock();
1822 		gSocketModule->release_socket(socket);
1823 	}
1824 
1825 	return segmentAction;
1826 }
1827 
1828 
1829 //	#pragma mark - send
1830 
1831 
1832 inline uint8
1833 TCPEndpoint::_CurrentFlags()
1834 {
1835 	// we don't set FLAG_FINISH here, instead we do it
1836 	// conditionally below depending if we are sending
1837 	// the last bytes of the send queue.
1838 
1839 	switch (fState) {
1840 		case CLOSED:
1841 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1842 
1843 		case SYNCHRONIZE_SENT:
1844 			return TCP_FLAG_SYNCHRONIZE;
1845 		case SYNCHRONIZE_RECEIVED:
1846 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1847 
1848 		case ESTABLISHED:
1849 		case FINISH_RECEIVED:
1850 		case FINISH_ACKNOWLEDGED:
1851 		case TIME_WAIT:
1852 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1853 		case FINISH_SENT:
1854 		case CLOSING:
1855 			return TCP_FLAG_ACKNOWLEDGE;
1856 
1857 		default:
1858 			return 0;
1859 	}
1860 }
1861 
1862 
1863 inline bool
1864 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1865 	uint32 segmentMaxSize, uint32 flightSize)
1866 {
1867 	if (length > 0) {
1868 		// Avoid the silly window syndrome - we only send a segment in case:
1869 		// - we have a full segment to send, or
1870 		// - we're at the end of our buffer queue, or
1871 		// - the buffer is at least larger than half of the maximum send window,
1872 		//   or
1873 		// - we're retransmitting data
1874 		if (length == segmentMaxSize
1875 			|| (fOptions & TCP_NODELAY) != 0
1876 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1877 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1878 			return true;
1879 	}
1880 
1881 	// check if we need to send a window update to the peer
1882 	if (segment.advertised_window > 0) {
1883 		// correct the window to take into account what already has been advertised
1884 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1885 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
1886 
1887 		// if we can advertise a window larger than twice the maximum segment
1888 		// size, or half the maximum buffer size we send a window update
1889 		if (window >= (fReceiveMaxSegmentSize << 1)
1890 			|| window >= (socket->receive.buffer_size >> 1))
1891 			return true;
1892 	}
1893 
1894 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1895 			| TCP_FLAG_RESET)) != 0)
1896 		return true;
1897 
1898 	// We do have urgent data pending
1899 	if (fSendUrgentOffset > fSendNext)
1900 		return true;
1901 
1902 	// there is no reason to send a segment just now
1903 	return false;
1904 }
1905 
1906 
1907 status_t
1908 TCPEndpoint::_SendQueued(bool force)
1909 {
1910 	return _SendQueued(force, fSendWindow);
1911 }
1912 
1913 
1914 /*!	Sends one or more TCP segments with the data waiting in the queue, or some
1915 	specific flags that need to be sent.

	\param force If \c true, a segment is sent without consulting
		_ShouldSendSegment(); if the usable window is zero while data is
		still queued, a single byte is sent to ask for a window update.
	\param sendWindow The send window to use; it is further limited by the
		congestion window (if that is non-zero) and reduced by what has
		already been sent but not yet acknowledged.
	\return \c B_OK on success (including when nothing had to be sent),
		\c B_ERROR if there is no route or the endpoint is listening,
		\c B_NO_MEMORY if no buffer could be created, or the error returned
		by fetching the data, adding the TCP header, or sending the buffer.
1916 */
1917 status_t
1918 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1919 {
1920 	if (fRoute == NULL)
1921 		return B_ERROR;
1922 
1923 	// in passive state?
1924 	if (fState == LISTEN)
1925 		return B_ERROR;
1926 
1927 	tcp_segment_header segment(_CurrentFlags());
1928 
1929 	if ((fOptions & TCP_NOOPT) == 0) {
1930 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1931 			segment.options |= TCP_HAS_TIMESTAMPS;
1932 			segment.timestamp_reply = fReceivedTimestamp;
1933 			segment.timestamp_value = tcp_now();
1934 		}
1935 
1936 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1937 			&& fSendNext == fInitialSendSequence) {
1938 			// add connection establishment options
1939 			segment.max_segment_size = fReceiveMaxSegmentSize;
1940 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1941 				segment.options |= TCP_HAS_WINDOW_SCALE;
1942 				segment.window_shift = fReceiveWindowShift;
1943 			}
1944 		}
1945 	}
1946 
	// advertise the free space of our receive queue as window
1947 	size_t availableBytes = fReceiveQueue.Free();
1948 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1949 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1950 	else
1951 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1952 
1953 	segment.acknowledge = fReceiveNext.Number();
1954 
1955 	// Process urgent data
1956 	if (fSendUrgentOffset > fSendNext) {
1957 		segment.flags |= TCP_FLAG_URGENT;
1958 		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
1959 	} else {
1960 		fSendUrgentOffset = fSendUnacknowledged.Number();
1961 			// Keep urgent offset updated, so that it doesn't reach into our
1962 			// send window on overlap
1963 		segment.urgent_offset = 0;
1964 	}
1965 
	// the congestion window (if set) caps the window we may use
1966 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1967 		sendWindow = fCongestionWindow;
1968 
1969 	// fSendUnacknowledged
1970 	//  |    fSendNext      fSendMax
1971 	//  |        |              |
1972 	//  v        v              v
1973 	//  -----------------------------------
1974 	//  | effective window           |
1975 	//  -----------------------------------
1976 
1977 	// Flight size represents the window of data which is currently in the
1978 	// ether. We should never send data such as the flight size becomes larger
1979 	// than the effective window. Note however that the effective window may be
1980 	// reduced (by congestion for instance), so at some point in time flight
1981 	// size may be larger than the currently calculated window.
1982 
1983 	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1984 	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
1985 
1986 	if (consumedWindow > sendWindow) {
1987 		sendWindow = 0;
1988 		// TODO: enter persist state? try to get a window update.
1989 	} else
1990 		sendWindow -= consumedWindow;
1991 
1992 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1993 		// send one byte of data to ask for a window update
1994 		// (triggered by the persist timer)
1995 		sendWindow = 1;
1996 	}
1997 
1998 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1999 	tcp_sequence previousSendNext = fSendNext;
2000 
	// The loop body runs at least once, so that a segment without any data
	// (carrying only flags or an acknowledgement) can be sent as well.
2001 	do {
2002 		uint32 segmentMaxSize = fSendMaxSegmentSize
2003 			- tcp_options_length(segment);
2004 		uint32 segmentLength = min_c(length, segmentMaxSize);
2005 
		// this segment reaches the end of the send queue
2006 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
2007 			if (state_needs_finish(fState))
2008 				segment.flags |= TCP_FLAG_FINISH;
2009 			if (length > 0)
2010 				segment.flags |= TCP_FLAG_PUSH;
2011 		}
2012 
2013 		// Determine if we should really send this segment
2014 		if (!force && !_ShouldSendSegment(segment, segmentLength,
2015 				segmentMaxSize, flightSize)) {
			// We hold data back; make sure a timer is running that will
			// get us back here.
2016 			if (fSendQueue.Available()
2017 				&& !gStackModule->is_timer_active(&fPersistTimer)
2018 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
2019 				_StartPersistTimer();
2020 			break;
2021 		}
2022 
2023 		net_buffer *buffer = gBufferModule->create(256);
2024 		if (buffer == NULL)
2025 			return B_NO_MEMORY;
2026 
2027 		status_t status = B_OK;
2028 		if (segmentLength > 0)
2029 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2030 		if (status < B_OK) {
2031 			gBufferModule->free(buffer);
2032 			return status;
2033 		}
2034 
2035 		LocalAddress().CopyTo(buffer->source);
2036 		PeerAddress().CopyTo(buffer->destination);
2037 
		// remember the payload size before the TCP header is added
2038 		uint32 size = buffer->size;
2039 		segment.sequence = fSendNext.Number();
2040 
2041 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
2042 			"\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n"
2043 			"\tlen %lu first %lu last %lu",
2044 			buffer, buffer->size, PrintAddress(buffer->source),
2045 			PrintAddress(buffer->destination), segment.flags, segment.sequence,
2046 			segment.acknowledge, segment.advertised_window,
2047 			fCongestionWindow, fSlowStartThreshold, segmentLength,
2048 			fSendQueue.FirstSequence().Number(),
2049 			fSendQueue.LastSequence().Number());
2050 		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2051 			fSendQueue.LastSequence()));
2052 
2053 		PROBE(buffer, sendWindow);
2054 		sendWindow -= buffer->size;
2055 
2056 		status = add_tcp_header(AddressModule(), segment, buffer);
2057 		if (status != B_OK) {
2058 			gBufferModule->free(buffer);
2059 			return status;
2060 		}
2061 
2062 		// Update send status - we need to do this before we send the data
2063 		// for local connections as the answer is directly handled
2064 
2065 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
			// the connection establishment options are only sent once, and
			// the SYN occupies one sequence number
2066 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
2067 			segment.max_segment_size = 0;
2068 			size++;
2069 		}
2070 
		// the FIN occupies one sequence number, too
2071 		if (segment.flags & TCP_FLAG_FINISH)
2072 			size++;
2073 
		// saved to be able to restore the send status on failure
2074 		uint32 sendMax = fSendMax.Number();
2075 		fSendNext += size;
2076 		if (fSendMax < fSendNext)
2077 			fSendMax = fSendNext;
2078 
2079 		fReceiveMaxAdvertised = fReceiveNext
2080 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
2081 
2082 		status = next->module->send_routed_data(next, fRoute, buffer);
2083 		if (status < B_OK) {
2084 			gBufferModule->free(buffer);
2085 
2086 			fSendNext = segment.sequence;
2087 			fSendMax = sendMax;
2088 				// restore send status
2089 			return status;
2090 		}
2091 
2092 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
2093 			fLastAcknowledgeSent = segment.acknowledge;
2094 
2095 		length -= segmentLength;
		// these flags must only be part of the first segment sent
2096 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2097 			| TCP_FLAG_FINISH);
2098 	} while (length > 0);
2099 
2100 	// if we sent data from the beginning of the send queue,
2101 	// start the retransmission timer
2102 	if (previousSendNext == fSendUnacknowledged
2103 		&& fSendNext > previousSendNext) {
2104 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
2105 			fRetransmitTimeout);
2106 
2107 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2108 	}
2109 
2110 	return B_OK;
2111 }
2112 
2113 
2114 int
2115 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2116 {
2117 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2118 }
2119 
2120 
2121 status_t
2122 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2123 {
2124 	if (fRoute == NULL) {
2125 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2126 		if (fRoute == NULL)
2127 			return ENETUNREACH;
2128 
2129 		if ((fRoute->flags & RTF_LOCAL) != 0)
2130 			fFlags |= FLAG_LOCAL;
2131 	}
2132 
2133 	// make sure connection does not already exist
2134 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2135 		fRoute->interface_address->local);
2136 	if (status < B_OK)
2137 		return status;
2138 
2139 	fInitialSendSequence = system_time() >> 4;
2140 	fSendNext = fInitialSendSequence;
2141 	fSendUnacknowledged = fInitialSendSequence;
2142 	fSendMax = fInitialSendSequence;
2143 	fSendUrgentOffset = fInitialSendSequence;
2144 
2145 	// we are counting the SYN here
2146 	fSendQueue.SetInitialSequence(fSendNext + 1);
2147 
2148 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2149 
2150 	// Compute the window shift we advertise to our peer - if it doesn't support
2151 	// this option, this will be reset to 0 (when its SYN is received)
2152 	fReceiveWindowShift = 0;
2153 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2154 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2155 		fReceiveWindowShift++;
2156 	}
2157 
2158 	return B_OK;
2159 }
2160 
2161 
2162 void
2163 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2164 {
2165 	size_t previouslyUsed = fSendQueue.Used();
2166 
2167 	fSendQueue.RemoveUntil(segment.acknowledge);
2168 	fSendUnacknowledged = segment.acknowledge;
2169 
2170 	if (fSendNext < fSendUnacknowledged)
2171 		fSendNext = fSendUnacknowledged;
2172 
2173 	if (fSendUnacknowledged == fSendMax)
2174 		gStackModule->cancel_timer(&fRetransmitTimer);
2175 
2176 	if (fSendQueue.Used() < previouslyUsed) {
2177 		// this ACK acknowledged data
2178 
2179 		if (segment.options & TCP_HAS_TIMESTAMPS)
2180 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2181 		else {
2182 			// TODO: Fallback to RFC 793 type estimation
2183 		}
2184 
2185 		if (is_writable(fState)) {
2186 			// notify threads waiting on the socket to become writable again
2187 			fSendList.Signal();
2188 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
2189 		}
2190 
2191 		if (fCongestionWindow < fSlowStartThreshold)
2192 			fCongestionWindow += fSendMaxSegmentSize;
2193 	}
2194 
2195 	if (fCongestionWindow >= fSlowStartThreshold) {
2196 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2197 
2198 		if (increment < fCongestionWindow)
2199 			increment = 1;
2200 		else
2201 			increment /= fCongestionWindow;
2202 
2203 		fCongestionWindow += increment;
2204 	}
2205 
2206 	// if there is data left to be send, send it now
2207 	if (fSendQueue.Used() > 0)
2208 		_SendQueued();
2209 }
2210 
2211 
2212 void
2213 TCPEndpoint::_Retransmit()
2214 {
2215 	TRACE("Retransmit()");
2216 	_ResetSlowStart();
2217 	fSendNext = fSendUnacknowledged;
2218 	_SendQueued();
2219 }
2220 
2221 
2222 void
2223 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2224 {
2225 	int32 rtt = roundTripTime;
2226 
2227 	// "smooth" round trip time as per Van Jacobson
2228 	rtt -= fRoundTripTime / 8;
2229 	fRoundTripTime += rtt;
2230 	if (rtt < 0)
2231 		rtt = -rtt;
2232 	rtt -= fRoundTripDeviation / 4;
2233 	fRoundTripDeviation += rtt;
2234 
2235 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2236 		* kTimestampFactor;
2237 
2238 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
2239 		roundTripTime);
2240 }
2241 
2242 
2243 void
2244 TCPEndpoint::_ResetSlowStart()
2245 {
2246 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2247 		2 * fSendMaxSegmentSize);
2248 	fCongestionWindow = fSendMaxSegmentSize;
2249 }
2250 
2251 
2252 //	#pragma mark - timer
2253 
2254 
2255 /*static*/ void
2256 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2257 {
2258 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2259 	T(Timer(endpoint, "retransmit"));
2260 
2261 	MutexLocker locker(endpoint->fLock);
2262 	if (!locker.IsLocked())
2263 		return;
2264 
2265 	endpoint->_Retransmit();
2266 }
2267 
2268 
2269 /*static*/ void
2270 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2271 {
2272 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2273 	T(Timer(endpoint, "persist"));
2274 
2275 	MutexLocker locker(endpoint->fLock);
2276 	if (!locker.IsLocked())
2277 		return;
2278 
2279 	// the timer might not have been canceled early enough
2280 	if (endpoint->State() == CLOSED)
2281 		return;
2282 
2283 	endpoint->_SendQueued(true);
2284 }
2285 
2286 
2287 /*static*/ void
2288 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2289 {
2290 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2291 	T(Timer(endpoint, "delayed ack"));
2292 
2293 	MutexLocker locker(endpoint->fLock);
2294 	if (!locker.IsLocked())
2295 		return;
2296 
2297 	// the timer might not have been canceled early enough
2298 	if (endpoint->State() == CLOSED)
2299 		return;
2300 
2301 	endpoint->SendAcknowledge(true);
2302 }
2303 
2304 
2305 /*static*/ void
2306 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2307 {
2308 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2309 	T(Timer(endpoint, "time-wait"));
2310 
2311 	MutexLocker locker(endpoint->fLock);
2312 	if (!locker.IsLocked())
2313 		return;
2314 
2315 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2316 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2317 		return;
2318 	}
2319 
2320 	locker.Unlock();
2321 
2322 	gSocketModule->release_socket(endpoint->socket);
2323 }
2324 
2325 
2326 //	#pragma mark -
2327 
2328 
2329 void
2330 TCPEndpoint::Dump() const
2331 {
2332 	kprintf("TCP endpoint %p\n", this);
2333 	kprintf("  state: %s\n", name_for_state(fState));
2334 	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2335 #if KDEBUG
2336 	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2337 #endif
2338 	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2339 	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2340 	kprintf("  send\n");
2341 	kprintf("    window shift: %u\n", fSendWindowShift);
2342 	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2343 		fSendUnacknowledged.Number());
2344 	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2345 	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2346 	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2347 	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2348 	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2349 	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2350 	kprintf("    queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
2351 #if DEBUG_BUFFER_QUEUE
2352 	fSendQueue.Dump();
2353 #endif
2354 	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2355 		fLastAcknowledgeSent.Number());
2356 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2357 		fInitialSendSequence.Number());
2358 	kprintf("  receive\n");
2359 	kprintf("    window shift: %u\n", fReceiveWindowShift);
2360 	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2361 	kprintf("    max advertised: %" B_PRIu32 "\n",
2362 		fReceiveMaxAdvertised.Number());
2363 	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2364 	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2365 	kprintf("    queue: %lu / %lu\n", fReceiveQueue.Available(),
2366 		fReceiveQueue.Size());
2367 #if DEBUG_BUFFER_QUEUE
2368 	fReceiveQueue.Dump();
2369 #endif
2370 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2371 		fInitialReceiveSequence.Number());
2372 	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2373 		fDuplicateAcknowledgeCount);
2374 	kprintf("  round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2375 		fRoundTripTime, fRoundTripDeviation);
2376 	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2377 	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2378 	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2379 }
2380 
2381